[llvm] [RISCV] Lower a shuffle which is nearly identity except one replicated element (PR #135292)

Thu Apr 10 18:11:14 PDT 2025

https://github.com/preames created https://github.com/llvm/llvm-project/pull/135292

This can be done with a vrgather.vi/vx, and (possibly) a register move. The alternative is to do a vrgather.vv with a full width index vector.

We'd already caught the two operands forms of this shuffle; this patch specifically handles the single operand form.

Unfortunately only in abstract, it would be nice if we canonicalized shuffles in some way wouldn't it?

>From 63cf771523879c02d664d68423cf6fd85415765f Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 10 Apr 2025 13:26:57 -0700
Subject: [PATCH] [RISCV] Lower a shuffle which is nearly identity except one
 replicated elem

This can be done with a vrgather.vi/vx, and (possibly) a register move.
The alternative is to do a vrgather.vv with a full width index vector.

We'd already caught the two operands forms of this shuffle; this patch
specifically handles the single operand form.

Unfortunately only in abstract, it would be nice if we canonicalized
shuffles in some way wouldn't it?
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 44 +++++++++++++++++++
 .../RISCV/rvv/fixed-vectors-shuffle-int.ll    | 33 +++++++-------
 2 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7d192756fd56..fd0562d141796 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4726,6 +4726,47 @@ static SDValue getDeinterleaveShiftAndTrunc(const SDLoc &DL, MVT VT,
                      DAG.getVectorIdxConstant(0, DL));
 }
 
+/// Match a single source shuffle which is an identity except that some
+/// particular element is repeated.  This can be lowered as a masked
+/// vrgather.vi/vx.  Note that the two source form of this is handled
+/// by the recursive splitting logic and doesn't need special handling.
+static SDValue lowerVECTOR_SHUFFLEAsVRGatherVX(ShuffleVectorSDNode *SVN,
+                                               const RISCVSubtarget &Subtarget,
+                                               SelectionDAG &DAG) {
+
+  SDLoc DL(SVN);
+  MVT VT = SVN->getSimpleValueType(0);
+  SDValue V1 = SVN->getOperand(0);
+  assert(SVN->getOperand(1).isUndef());
+  ArrayRef<int> Mask = SVN->getMask();
+  const unsigned NumElts = VT.getVectorNumElements();
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  std::optional<int> SplatIdx;
+  for (auto [I, M] : enumerate(Mask)) {
+    if (M == -1 || I == (unsigned)M)
+      continue;
+    if (SplatIdx && *SplatIdx != M)
+      return SDValue();
+    SplatIdx = M;
+  }
+
+  if (!SplatIdx)
+    return SDValue();
+
+  SmallVector<SDValue> MaskVals;
+  for (int MaskIndex : Mask) {
+    bool SelectMaskVal = MaskIndex == *SplatIdx;
+    MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
+  }
+  assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+  SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+  SDValue Splat = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT),
+                                       SmallVector<int>(NumElts, *SplatIdx));
+  return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, Splat, V1);
+}
+
 // Lower the following shuffle to vslidedown.
 // a)
 // t49: v8i8 = extract_subvector t13, Constant:i64<0>
@@ -5852,6 +5893,9 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
       return V;
 
+    if (SDValue V = lowerVECTOR_SHUFFLEAsVRGatherVX(SVN, Subtarget, DAG))
+      return V;
+
     // Match a spread(4,8) which can be done via extend and shift.  Spread(2)
     // is fully covered in interleave(2) above, so it is ignored here.
     if (VT.getScalarSizeInBits() < Subtarget.getELen()) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
index b26fc5653afec..e6375e276d37f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
@@ -1419,11 +1419,11 @@ define <8 x i32> @shuffle_v8i32_locally_repeating_neg(<8 x i32> %a) {
 define <8 x i8> @identity_splat0(<8 x i8> %v) {
 ; CHECK-LABEL: identity_splat0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI88_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI88_0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    li a0, 25
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vrgather.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %shuf = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 0, i32 5, i32 6, i32 7>
@@ -1433,11 +1433,11 @@ define <8 x i8> @identity_splat0(<8 x i8> %v) {
 define <8 x i8> @identity_splat2(<8 x i8> %v) {
 ; CHECK-LABEL: identity_splat2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI89_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI89_0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    li a0, 28
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vrgather.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %shuf = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 5, i32 6, i32 7>
@@ -1448,14 +1448,13 @@ define <8 x i8> @identity_splat2(<8 x i8> %v) {
 define <8 x i8> @vmerge_vxm(<8 x i8> %v, i8 %s) {
 ; CHECK-LABEL: vmerge_vxm:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI90_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI90_0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a1)
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT:    li a1, 25
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %ins = insertelement <8 x i8> %v, i8 %s, i32 0