[llvm] [RISCV] Lower shuffle which splats a single span (without exact VLEN) (PR #127108)

Thu Feb 13 11:03:21 PST 2025

https://github.com/preames updated https://github.com/llvm/llvm-project/pull/127108

>From b04b30324516cdebd922e38a7a5d58225a3d917a Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 13 Feb 2025 09:35:28 -0800
Subject: [PATCH 1/2] [RISCV] Lower shuffle which splats a single span (without
 exact VLEN)

If we have a shuffle which repeats the same pattern of elements,
all of which come from the first register in the source register
group, we can lower this to a single vrgather at m1 to perform
the element rearrangement, and reuse that for each register in
the result vector register group.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 48 +++++++++++++++++++
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   | 37 ++++++++------
 2 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 829eef2e4d9d9..1156fd2e67fed 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5360,6 +5360,23 @@ static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
                 [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
 }
 
+/// Return true for a mask which performs an arbitrary shuffle within the first
+/// span, and then repeats that same result across all remaining spans.  Note
+/// that this doesn't check if all the inputs come from a single span!
+static bool isSpanSplatShuffle(ArrayRef<int> Mask, int Span) {
+  SmallVector<int> LowSpan(Span, -1);
+  for (auto [I, M] : enumerate(Mask)) {
+    if (M == -1)
+      continue;
+    int SpanIdx = I % Span;
+    if (LowSpan[SpanIdx] == -1)
+      LowSpan[SpanIdx] = M;
+    if (LowSpan[SpanIdx] != M)
+      return false;
+  }
+  return true;
+}
+
 /// Try to widen element type to get a new mask value for a better permutation
 /// sequence.  This doesn't try to inspect the widened mask for profitability;
 /// we speculate the widened form is equal or better.  This has the effect of
@@ -5775,6 +5792,37 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
         Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
                              SubVec, SubIdx);
       }
+    } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX) &&
+               isSpanSplatShuffle(Mask, MinVLMAX)) {
+      // If we have a shuffle which only uses the first register in our source
+      // register group, and repeats the same index across all spans, we can
+      // use a single vrgather (and possibly some register moves).
+      // TODO: This can be generalized for m2 or m4, or for any shuffle for
+      // which we can do a linear number of shuffles to form an m1 which
+      // contains all the output elements.
+      const MVT M1VT = getLMUL1VT(ContainerVT);
+      EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
+      auto [InnerTrueMask, InnerVL] =
+          getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
+      int N = ContainerVT.getVectorMinNumElements() /
+              M1VT.getVectorMinNumElements();
+      assert(isPowerOf2_32(N) && N <= 8);
+      SDValue SubV1 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
+                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubIndex =
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+                    DAG.getVectorIdxConstant(0, DL));
+      SDValue SubVec =
+        DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+                    DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+      Gather = DAG.getUNDEF(ContainerVT);
+      for (int i = 0; i < N; i++) {
+        SDValue SubIdx =
+            DAG.getVectorIdxConstant(M1VT.getVectorMinNumElements() * i, DL);
+        Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
+                             SubVec, SubIdx);
+      }
     } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX)) {
       // If we have a shuffle which only uses the first register in our
       // source register group, we can do a linear number of m1 vrgathers
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index d7120b4a16938..3e31c9de61657 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1311,22 +1311,14 @@ define void @shuffle_i128_splat(ptr %p) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    lui a2, 16
-; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a2
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v10, v9, a1
-; CHECK-NEXT:    vslidedown.vx v11, v10, a1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
-; CHECK-NEXT:    vrgatherei16.vv v14, v8, v11
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v9, v11, a1
+; CHECK-NEXT:    vmv.v.x v9, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v15, v8, v9
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vmv.v.v v14, v12
+; CHECK-NEXT:    vmv.v.v v15, v12
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vse64.v v12, (a0)
 ; CHECK-NEXT:    ret
@@ -1435,3 +1427,20 @@ define <4 x i16> @vmerge_3(<4 x i16> %x) {
    %s = shufflevector <4 x i16> %x, <4 x i16> <i16 poison, i16 5, i16 poison, i16 poison>, <4 x i32> <i32 0, i32 5, i32 5, i32 3>
    ret <4 x i16> %s
 }
+
+
+define <8 x i64> @shuffle_v8i164_span_splat(<8 x i64> %a) nounwind {
+; CHECK-LABEL: shuffle_v8i164_span_splat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 1
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vmv.v.v v14, v12
+; CHECK-NEXT:    vmv.v.v v15, v12
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
+  ret <8 x i64> %res
+}

>From 7ec5b854035c6573770181b5883671ca4e92316c Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 13 Feb 2025 11:00:16 -0800
Subject: [PATCH 2/2] clang-format

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1156fd2e67fed..13ea184c817fb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5356,8 +5356,7 @@ static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
 
 /// Is this mask only using elements from the first span of the input?
 static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
-  return all_of(Mask,
-                [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
+  return all_of(Mask, [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
 }
 
 /// Return true for a mask which performs an arbitrary shuffle within the first
@@ -5807,15 +5806,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
       int N = ContainerVT.getVectorMinNumElements() /
               M1VT.getVectorMinNumElements();
       assert(isPowerOf2_32(N) && N <= 8);
-      SDValue SubV1 =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
-                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
+                                  DAG.getVectorIdxConstant(0, DL));
       SDValue SubIndex =
-        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
-                    DAG.getVectorIdxConstant(0, DL));
-      SDValue SubVec =
-        DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
-                    DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubVec = DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+                                   DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
       Gather = DAG.getUNDEF(ContainerVT);
       for (int i = 0; i < N; i++) {
         SDValue SubIdx =