[llvm] [RISCV] Prefer alt opcode vectorization if unaligned vector mem accesses (PR #154153)

Mikhail Gudim via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 18 13:00:37 PDT 2025


https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/154153

>From fdb931f8dfb12dbeca3aa00749777b14be2af7ca Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Mon, 18 Aug 2025 09:28:29 -0700
Subject: [PATCH] [RISCV] Unaligned vec mem => prefer alt opc vec

Return `true` in `RISCVTTIImpl::preferAlternateOpcodeVectorization` if
the subtarget supports unaligned vector memory accesses.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  4 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  2 +-
 .../RISCV/alt-opc-vectorization.ll            | 82 +++++++++++++++++++
 3 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/alt-opc-vectorization.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 85b3059d87da7..a0763e3d42991 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2713,6 +2713,10 @@ unsigned RISCVTTIImpl::getMinTripCountTailFoldingThreshold() const {
   return RVVMinTripCount;
 }
 
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+  return ST->enableUnalignedVectorMem();
+}
+
 TTI::AddressingModeKind
 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
                                          ScalarEvolution *SE) const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6a1f4b3e3bedf..254908f97186c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -132,7 +132,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
 
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
 
-  bool preferAlternateOpcodeVectorization() const override { return false; }
+  bool preferAlternateOpcodeVectorization() const override;
 
   bool preferEpilogueVectorization() const override {
     // Epilogue vectorization is usually unprofitable - tail folding or
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/alt-opc-vectorization.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/alt-opc-vectorization.ll
new file mode 100644
index 0000000000000..bd4d512705f20
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/alt-opc-vectorization.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+v,+unaligned-vector-mem \
+; RUN: -passes=slp-vectorizer -S \
+; RUN: < %s | FileCheck %s --check-prefixes=UNALIGNED_VEC_MEM
+
+; RUN: opt -mtriple=riscv64 -mattr=+v \
+; RUN: -passes=slp-vectorizer -S \
+; RUN: < %s | FileCheck %s --check-prefixes=NO_UNALIGNED_VEC_MEM
+
+define void @alternate_opcodes(ptr %pl, ptr %ps, i8 %x) {
+; UNALIGNED_VEC_MEM-LABEL: define void @alternate_opcodes(
+; UNALIGNED_VEC_MEM-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]], i8 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNALIGNED_VEC_MEM-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
+; UNALIGNED_VEC_MEM-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[GEP_L0]], i64 20, <8 x i1> splat (i1 true), i32 8)
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i32 0
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> poison, <8 x i32> zeroinitializer
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP4:%.*]] = add <8 x i8> [[TMP1]], [[TMP3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP5:%.*]] = sub <8 x i8> [[TMP1]], [[TMP3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; UNALIGNED_VEC_MEM-NEXT:    store <8 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    ret void
+;
+; NO_UNALIGNED_VEC_MEM-LABEL: define void @alternate_opcodes(
+; NO_UNALIGNED_VEC_MEM-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]], i8 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[GEP_L0]], i64 20, <8 x i1> splat (i1 true), i32 8)
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i32 0
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> poison, <8 x i32> zeroinitializer
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[TMP4:%.*]] = add <8 x i8> [[TMP1]], [[TMP3]]
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[TMP5:%.*]] = sub <8 x i8> [[TMP1]], [[TMP3]]
+; NO_UNALIGNED_VEC_MEM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; NO_UNALIGNED_VEC_MEM-NEXT:    store <8 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
+; NO_UNALIGNED_VEC_MEM-NEXT:    ret void
+;
+  %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
+  %gep_l1 = getelementptr inbounds i8, ptr %pl, i64 20
+  %gep_l2 = getelementptr inbounds i8, ptr %pl, i64 40
+  %gep_l3 = getelementptr inbounds i8, ptr %pl, i64 60
+  %gep_l4 = getelementptr inbounds i8, ptr %pl, i64 80
+  %gep_l5 = getelementptr inbounds i8, ptr %pl, i64 100
+  %gep_l6 = getelementptr inbounds i8, ptr %pl, i64 120
+  %gep_l7 = getelementptr inbounds i8, ptr %pl, i64 140
+
+  %load0  = load i8, ptr %gep_l0
+  %load1  = load i8, ptr %gep_l1
+  %load2  = load i8, ptr %gep_l2
+  %load3  = load i8, ptr %gep_l3
+  %load4  = load i8, ptr %gep_l4
+  %load5  = load i8, ptr %gep_l5
+  %load6  = load i8, ptr %gep_l6
+  %load7  = load i8, ptr %gep_l7
+
+  %add0 = add i8 %load0, %x
+  %add1 = add i8 %load1, %x
+  %add2 = add i8 %load2, %x
+  %add3 = add i8 %load3, %x
+  %sub0 = sub i8 %load4, %x
+  %sub1 = sub i8 %load5, %x
+  %sub2 = sub i8 %load6, %x
+  %sub3 = sub i8 %load7, %x
+
+  %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
+  %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
+  %gep_s2 = getelementptr inbounds i8, ptr %ps, i64 2
+  %gep_s3 = getelementptr inbounds i8, ptr %ps, i64 3
+  %gep_s4 = getelementptr inbounds i8, ptr %ps, i64 4
+  %gep_s5 = getelementptr inbounds i8, ptr %ps, i64 5
+  %gep_s6 = getelementptr inbounds i8, ptr %ps, i64 6
+  %gep_s7 = getelementptr inbounds i8, ptr %ps, i64 7
+
+  store i8 %add0, ptr %gep_s0
+  store i8 %add1, ptr %gep_s1
+  store i8 %add2, ptr %gep_s2
+  store i8 %add3, ptr %gep_s3
+  store i8 %sub0, ptr %gep_s4
+  store i8 %sub1, ptr %gep_s5
+  store i8 %sub2, ptr %gep_s6
+  store i8 %sub3, ptr %gep_s7
+  ret void
+}



More information about the llvm-commits mailing list