[llvm] 2b4a1d4 - [RISCV] Improve codegen for shuffles with LHS/RHS splats

Fraser Cormack via llvm-commits <llvm-commits@lists.llvm.org>
Mon Aug 9 02:41:05 PDT 2021


Author: Fraser Cormack
Date: 2021-08-09T10:31:40+01:00
New Revision: 2b4a1d4b86f63dc8e60632985c63b7b72d0fe77c

URL: https://github.com/llvm/llvm-project/commit/2b4a1d4b86f63dc8e60632985c63b7b72d0fe77c
DIFF: https://github.com/llvm/llvm-project/commit/2b4a1d4b86f63dc8e60632985c63b7b72d0fe77c.diff

LOG: [RISCV] Improve codegen for shuffles with LHS/RHS splats

Shuffles which are broken into separate halves may reveal splats, in
which one half accesses its source vector via a single index; such
gathers can be optimized to use "vrgather.vi".

This optimization could be achieved by adding extra patterns to match
`vrgather_vv_vl` which uses a splat as an index operand, but this patch
instead identifies splats earlier. This way, future optimizations can
build on top of the data gathered here, e.g., to splat-gather dominant
indices and insert any leftovers.
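
A hypothetical sketch of where that follow-up might start (the helper
name is invented and is not part of this patch): pick the most
frequently counted index of a half, splat-gather it, and patch up the
leftover elements afterwards:

#include <algorithm>
#include <map>

// Hypothetical helper for the "dominant index" idea above: find the
// most frequently used index of one shuffle half. A future transform
// could splat-gather this index with vrgather.vi and then insert the
// remaining (straggler) elements individually.
static int dominantIndex(const std::map<int, unsigned> &Counts) {
  auto It = std::max_element(
      Counts.begin(), Counts.end(),
      [](const auto &A, const auto &B) { return A.second < B.second; });
  return It == Counts.end() ? -1 : It->first;
}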

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D107449

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e9ae4e685b29..1fbbf0b3699f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1971,6 +1971,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
   bool InvertMask = IsSelect == SwapOps;
 
+  // Keep a track of which non-undef indices are used by each LHS/RHS shuffle
+  // half.
+  DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;
+
   // Now construct the mask that will be used by the vselect or blended
   // vrgather operation. For vrgathers, construct the appropriate indices into
   // each vector.
@@ -1985,6 +1989,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
       GatherIndicesRHS.push_back(
           IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
                             : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
+      if (IsLHSOrUndefIndex && MaskIndex >= 0)
+        ++LHSIndexCounts[MaskIndex];
+      if (!IsLHSOrUndefIndex)
+        ++RHSIndexCounts[MaskIndex - NumElts];
     }
   }
 
@@ -2008,13 +2016,14 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return SDValue();
   }
 
-  unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+  unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
+  unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
   MVT IndexVT = VT.changeTypeToInteger();
   // Since we can't introduce illegal index types at this stage, use i16 and
   // vrgatherei16 if the corresponding index type for plain vrgather is greater
   // than XLenVT.
   if (IndexVT.getScalarType().bitsGT(XLenVT)) {
-    GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+    GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
     IndexVT = IndexVT.changeVectorElementType(MVT::i16);
   }
 
@@ -2027,28 +2036,48 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
     Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
   } else {
-    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-    LHSIndices =
-        convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
-
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    Gather =
-        DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL);
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix-up any stragglers, if
+    // that's beneficial.
+    if (LHSIndexCounts.size() == 1) {
+      int SplatIndex = LHSIndexCounts.begin()->getFirst();
+      Gather =
+          DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
+                      DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+    } else {
+      SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+      LHSIndices =
+          convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                           TrueMask, VL);
+    }
   }
 
   // If a second vector operand is used by this shuffle, blend it in with an
   // additional vrgather.
   if (!V2.isUndef()) {
+    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix-up any stragglers, if
+    // that's beneficial.
+    if (RHSIndexCounts.size() == 1) {
+      int SplatIndex = RHSIndexCounts.begin()->getFirst();
+      V2 = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+                       DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+    } else {
+      SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+      RHSIndices =
+          convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+      V2 = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, TrueMask,
+                       VL);
+    }
+
     MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
     SelectMask =
         convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
 
-    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-    RHSIndices =
-        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-
-    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-    V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL);
     Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
                          Gather, VL);
   }
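
For orientation, the sequence built above computes, in scalar terms, a
gather from V1, an optional gather from V2, and an element-wise select;
a minimal model (invented names, defined mask indices only) is:

#include <cstdio>
#include <vector>

// Scalar model of the lowered shuffle: gather from V1 with the LHS
// indices, gather from V2 with the RHS indices, then merge via a
// select whose mask marks RHS elements. The splat case handled above
// is simply "all indices of a half identical", which maps to a single
// vrgather.vi/vrgather.vx instead of a full index-vector gather.
static std::vector<int> shuffleModel(const std::vector<int> &V1,
                                     const std::vector<int> &V2,
                                     const std::vector<int> &Mask) {
  int NumElts = (int)V1.size();
  std::vector<int> Result;
  for (int M : Mask) // gather from whichever half M selects
    Result.push_back(M < NumElts ? V1[M] : V2[M - NumElts]);
  return Result;
}

int main() {
  std::vector<int> V1 = {10, 11, 12, 13, 14, 15, 16, 17};
  std::vector<int> V2 = {20, 21, 22, 23, 24, 25, 26, 27};
  for (int E : shuffleModel(V1, V2, {2, 8, 2, 2, 2, 2, 8, 2}))
    std::printf("%d ", E); // prints: 12 20 12 12 12 12 20 12
  std::printf("\n");
  return 0;
}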

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index d8eee8ddbd3e..7a4f133989f2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -1314,6 +1314,18 @@ foreach vti = AllIntegerVectors in {
                  vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
                  vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
 
+  def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                          (riscv_vrgather_vx_vl
+                                            vti.RegClass:$rs2,
+                                            uimm5:$imm,
+                                            (vti.Mask true_mask),
+                                            VLOpFrag),
+                                          vti.RegClass:$merge,
+                                          VLOpFrag)),
+            (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
+                 vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+                 vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+
   // emul = lmul * 16 / sew
   defvar vlmul = vti.LMul;
   defvar octuple_lmul = vlmul.octuple;
@@ -1385,6 +1397,18 @@ foreach vti = AllFloatVectors in {
                  vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
                  vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
 
+  def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                          (riscv_vrgather_vx_vl
+                                            vti.RegClass:$rs2,
+                                            uimm5:$imm,
+                                            (vti.Mask true_mask),
+                                            VLOpFrag),
+                                          vti.RegClass:$merge,
+                                          VLOpFrag)),
+            (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
+                 vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+                 vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+
   defvar vlmul = vti.LMul;
   defvar octuple_lmul = vlmul.octuple;
   defvar octuple_emul = !srl(!mul(octuple_lmul, 16), vti.Log2SEW);
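
One detail these patterns encode: the fold to PseudoVRGATHER_VI only
applies when the splatted index is a uimm5, mirroring vrgather.vi's
5-bit unsigned immediate; larger constant indices stay as vrgather.vx
with the index in a scalar register. A trivial sketch of the range
check (the helper name is invented):

#include <cstdint>

// vrgather.vi encodes its index as a 5-bit unsigned immediate; the
// uimm5 operand in the patterns above enforces exactly this range.
// Indices outside it must stay in a GPR and use vrgather.vx.
static bool fitsVrgatherVi(int64_t SplatIndex) {
  return SplatIndex >= 0 && SplatIndex <= 31;
}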

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 12823121f6ff..b4d28a3579b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -57,28 +57,25 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ;
 ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vmv.v.i v25, 0
-; LMULMAX2-NEXT:    vrgather.vv v26, v8, v25
 ; LMULMAX2-NEXT:    addi a0, zero, 2
 ; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; LMULMAX2-NEXT:    vmv.s.x v0, a0
 ; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vmv.v.i v27, 3
+; LMULMAX2-NEXT:    vrgather.vi v25, v8, 0
 ; LMULMAX2-NEXT:    vsetvli zero, zero, e32, m1, tu, mu
-; LMULMAX2-NEXT:    vrgather.vv v26, v9, v27, v0.t
-; LMULMAX2-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vrgather.vv v28, v10, v25
+; LMULMAX2-NEXT:    vrgather.vi v25, v9, 3, v0.t
 ; LMULMAX2-NEXT:    addi a0, zero, 8
 ; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; LMULMAX2-NEXT:    vmv.s.x v0, a0
-; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, tu, mu
-; LMULMAX2-NEXT:    vrgather.vv v28, v11, v27, v0.t
+; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-NEXT:    vrgather.vi v26, v10, 0
+; LMULMAX2-NEXT:    vsetvli zero, zero, e32, m1, tu, mu
+; LMULMAX2-NEXT:    vrgather.vi v26, v11, 3, v0.t
 ; LMULMAX2-NEXT:    addi a0, zero, 3
 ; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; LMULMAX2-NEXT:    vmv.s.x v0, a0
 ; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vmerge.vvm v8, v28, v26, v0
+; LMULMAX2-NEXT:    vmerge.vvm v8, v26, v25, v0
 ; LMULMAX2-NEXT:    ret
   %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>
   ret <4 x float> %z

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 5393c7adf64b..ced3d6d87a1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -142,10 +142,8 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 ; RV32-NEXT:    addi a0, zero, 8
 ; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; RV32-NEXT:    vmv.v.i v25, 1
-; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
-; RV32-NEXT:    vrgatherei16.vv v26, v10, v25, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, tu, mu
+; RV32-NEXT:    vrgather.vi v26, v10, 1, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v26
 ; RV32-NEXT:    ret
 ;
@@ -159,10 +157,8 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 ; RV64-NEXT:    addi a0, zero, 8
 ; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT:    vmv.v.i v28, 1
-; RV64-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
-; RV64-NEXT:    vrgather.vv v26, v10, v28, v0.t
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, tu, mu
+; RV64-NEXT:    vrgather.vi v26, v10, 1, v0.t
 ; RV64-NEXT:    vmv2r.v v8, v26
 ; RV64-NEXT:    ret
   %s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
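
Worth noting in the RV32 output above: since the splat path no longer
materializes an index vector at all, the vrgatherei16 workaround (used
when the natural index type exceeds XLenVT, per the comment in the C++
change) disappears along with the vmv.v.i and the extra vsetvli toggle.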

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index ca76aea58a85..c953e81decea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -93,10 +93,8 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-NEXT:    addi a0, zero, 8
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 1
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, tu, mu
+; CHECK-NEXT:    vrgather.vi v25, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
@@ -388,16 +386,13 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) {
 define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 2
-; CHECK-NEXT:    vrgather.vv v25, v8, v26
 ; CHECK-NEXT:    addi a0, zero, 66
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vrgather.vi v25, v8, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 2, i32 2, i32 2, i32 2, i32 8, i32 2>
@@ -417,10 +412,8 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    addi a0, zero, 66
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
+; CHECK-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 4, i32 8, i32 2, i32 2, i32 2, i32 2, i32 8, i32 2>
@@ -430,12 +423,11 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0_ins_i0we4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 2
-; CHECK-NEXT:    vrgather.vv v25, v8, v26
 ; CHECK-NEXT:    addi a0, zero, 67
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v25, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.i v26, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
@@ -458,10 +450,8 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; RV32-NEXT:    addi a0, zero, 66
 ; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT:    vmv.v.i v26, 0
-; RV32-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; RV32-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
+; RV32-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v25
 ; RV32-NEXT:    ret
 ;
@@ -476,10 +466,8 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; RV64-NEXT:    addi a0, zero, 66
 ; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT:    vmv.v.i v26, 0
-; RV64-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; RV64-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
+; RV64-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; RV64-NEXT:    vmv1r.v v8, v25
 ; RV64-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 2, i32 8, i32 2>
@@ -489,19 +477,19 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0_ins_i2we4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 2
-; CHECK-NEXT:    vrgather.vv v25, v8, v26
 ; CHECK-NEXT:    addi a0, zero, 4
-; CHECK-NEXT:    vmv.s.x v26, a0
-; CHECK-NEXT:    vmv.v.i v27, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    vmv.v.i v26, 0
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, tu, mu
-; CHECK-NEXT:    vslideup.vi v27, v26, 2
+; CHECK-NEXT:    vslideup.vi v26, v25, 2
 ; CHECK-NEXT:    addi a0, zero, 70
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v27, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v25, v8, 2
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
+; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 12, i32 2, i32 2, i32 2, i32 8, i32 2>
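
This final case is a useful contrast with splat_ve2_we0: the LHS half
still collapses to vrgather.vi (only index 2 is used), but the RHS half
draws on two distinct indices (0 and 4 after subtracting NumElts), so
the single-entry check fails and the general masked vrgather.vv path is
kept for that half.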

