[llvm] 89f119c - [RISCV] Update matchSplatAsGather to use the index of extract_elt if in-bounds (#118873)

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 21 12:51:59 PST 2025


Author: Mikhail R. Gadelha
Date: 2025-01-21T12:51:41-08:00
New Revision: 89f119cbdae0beb606229ca422cdd9e1400d0746

URL: https://github.com/llvm/llvm-project/commit/89f119cbdae0beb606229ca422cdd9e1400d0746
DIFF: https://github.com/llvm/llvm-project/commit/89f119cbdae0beb606229ca422cdd9e1400d0746.diff

LOG: [RISCV] Update matchSplatAsGather to use the index of extract_elt if in-bounds (#118873)

This is a follow-up to #117878 and allows vrgather to be used when the index
of the extract_elt is a constant that lies within the bounds of VT.

This patch replaces the previous behavior of bailing out when the search
vector is longer than the vector of elements we are searching for. Since
matchSplatAsGather works on EXTRACT_VECTOR_ELT, and we therefore know the
index the element is extracted from, we only need to check whether we would
be inserting a larger vector into a smaller one, in which case we perform an
extract instead.

Co-authored-by: Luke Lau luke_lau at icloud.com
Co-authored-by: Philip Reames preames at rivosinc.com
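
For reference, here is a minimal IR sketch of the splat-of-extract pattern this
change targets, adapted from the splat_idx_nxv8i8_nxv4i8_constant_3 test updated
below (the shufflevector splat and the return are assumed, since the diff elides
the test bodies). The extract index 3 is a constant that lies within the
<vscale x 4 x i8> result type, so the splat can now be lowered with a single
vrgather.vi instead of a slidedown, a scalar move, and a scalar broadcast, as
shown in the splat-vectors.ll diff below:

    define <vscale x 4 x i8> @splat_idx_nxv8i8_nxv4i8_constant_3(<vscale x 8 x i8> %v) {
      ; Constant, in-bounds index extracted from a longer source vector.
      %x = extractelement <vscale x 8 x i8> %v, i64 3
      ; Splat the extracted scalar across the shorter result type.
      %ins = insertelement <vscale x 4 x i8> poison, i8 %x, i32 0
      %splat = shufflevector <vscale x 4 x i8> %ins, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
      ret <vscale x 4 x i8> %splat
    }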

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
    llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d1a5a7602914516..fe6dd0ac6d5a4e4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3522,27 +3522,43 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
   // different
   // FIXME: Support i1 vectors, maybe by promoting to i8?
   MVT EltTy = VT.getVectorElementType();
-  if (EltTy == MVT::i1 ||
-      EltTy != Vec.getSimpleValueType().getVectorElementType())
+  MVT SrcVT = Vec.getSimpleValueType();
+  if (EltTy == MVT::i1 || EltTy != SrcVT.getVectorElementType())
     return SDValue();
   SDValue Idx = SplatVal.getOperand(1);
   // The index must be a legal type.
   if (Idx.getValueType() != Subtarget.getXLenVT())
     return SDValue();
 
-  // Check that Index lies within VT
-  // TODO: Can we check if the Index is constant and known in-bounds?
-  if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits()))
-    return SDValue();
+  // Check that we know Idx lies within VT
+  if (!TypeSize::isKnownLE(SrcVT.getSizeInBits(), VT.getSizeInBits())) {
+    auto *CIdx = dyn_cast<ConstantSDNode>(Idx);
+    if (!CIdx || CIdx->getZExtValue() >= VT.getVectorMinNumElements())
+      return SDValue();
+  }
 
+  // Convert fixed length vectors to scalable
   MVT ContainerVT = VT;
   if (VT.isFixedLengthVector())
     ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
 
-  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
-                    DAG.getUNDEF(ContainerVT), Vec,
-                    DAG.getVectorIdxConstant(0, DL));
+  MVT SrcContainerVT = SrcVT;
+  if (SrcVT.isFixedLengthVector()) {
+    SrcContainerVT = getContainerForFixedLengthVector(DAG, SrcVT, Subtarget);
+    Vec = convertToScalableVector(SrcContainerVT, Vec, DAG, Subtarget);
+  }
+
+  // Put Vec in a VT sized vector
+  if (SrcContainerVT.getVectorMinNumElements() <
+      ContainerVT.getVectorMinNumElements())
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+                      DAG.getUNDEF(ContainerVT), Vec,
+                      DAG.getVectorIdxConstant(0, DL));
+  else
+    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+                      DAG.getVectorIdxConstant(0, DL));
 
+  // We checked that Idx fits inside VT earlier
   auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
 
   SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, Vec,

diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index 5d730da09ef83fd..7d37d91ee21b552 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -143,9 +143,8 @@ define <vscale x 16 x i1> @match_nxv16i8_v16i8(<vscale x 16 x i8> %op1, <16 x i8
 define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask) {
 ; CHECK-LABEL: match_v16i8_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v9, 0
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v9, 0
 ; CHECK-NEXT:    vmseq.vv v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v0, v8, v0
 ; CHECK-NEXT:    ret
@@ -383,69 +382,63 @@ define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) {
 ; CHECK-LABEL: match_v8i8_v16i8:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vslidedown.vi v11, v9, 2
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v9, 3
-; CHECK-NEXT:    vmv.x.s a2, v11
-; CHECK-NEXT:    vslidedown.vi v11, v9, 4
-; CHECK-NEXT:    vmv.x.s a3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v9, 5
-; CHECK-NEXT:    vmv.x.s a4, v11
-; CHECK-NEXT:    vslidedown.vi v11, v9, 6
-; CHECK-NEXT:    vmv.x.s a5, v10
-; CHECK-NEXT:    vslidedown.vi v10, v9, 7
-; CHECK-NEXT:    vmv.x.s a6, v11
 ; CHECK-NEXT:    vslidedown.vi v11, v9, 8
-; CHECK-NEXT:    vmv.x.s a7, v10
-; CHECK-NEXT:    vslidedown.vi v10, v9, 9
-; CHECK-NEXT:    vmv.x.s t0, v11
+; CHECK-NEXT:    vmv.x.s a0, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 9
+; CHECK-NEXT:    vmv.x.s a1, v11
 ; CHECK-NEXT:    vslidedown.vi v11, v9, 10
-; CHECK-NEXT:    vmv.x.s t1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v9, 11
-; CHECK-NEXT:    vmv.x.s t2, v11
+; CHECK-NEXT:    vmv.x.s a2, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 11
+; CHECK-NEXT:    vmv.x.s a3, v11
 ; CHECK-NEXT:    vslidedown.vi v11, v9, 12
-; CHECK-NEXT:    vmv.x.s t3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v9, 13
-; CHECK-NEXT:    vmv.x.s t4, v11
+; CHECK-NEXT:    vmv.x.s a4, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 13
+; CHECK-NEXT:    vmv.x.s a5, v11
 ; CHECK-NEXT:    vslidedown.vi v11, v9, 14
-; CHECK-NEXT:    vslidedown.vi v9, v9, 15
-; CHECK-NEXT:    vmv.x.s t5, v10
+; CHECK-NEXT:    vmv.x.s a6, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 15
+; CHECK-NEXT:    vmv.x.s a7, v11
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmseq.vx v10, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v11
-; CHECK-NEXT:    vmseq.vx v11, v8, a1
-; CHECK-NEXT:    vmv.x.s a1, v9
-; CHECK-NEXT:    vmseq.vx v9, v8, a2
+; CHECK-NEXT:    vrgather.vi v11, v9, 0
+; CHECK-NEXT:    vmseq.vv v10, v8, v10
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vrgather.vi v11, v9, 2
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
 ; CHECK-NEXT:    vmor.mm v10, v10, v11
-; CHECK-NEXT:    vmseq.vx v11, v8, a3
+; CHECK-NEXT:    vrgather.vi v11, v9, 3
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 4
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 5
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 6
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vrgather.vi v12, v9, 7
+; CHECK-NEXT:    vmseq.vv v9, v8, v12
 ; CHECK-NEXT:    vmor.mm v9, v10, v9
-; CHECK-NEXT:    vmseq.vx v10, v8, a4
+; CHECK-NEXT:    vmseq.vx v10, v8, a1
 ; CHECK-NEXT:    vmor.mm v9, v9, v11
-; CHECK-NEXT:    vmseq.vx v11, v8, a5
+; CHECK-NEXT:    vmseq.vx v11, v8, a2
 ; CHECK-NEXT:    vmor.mm v9, v9, v10
-; CHECK-NEXT:    vmseq.vx v10, v8, a6
+; CHECK-NEXT:    vmseq.vx v10, v8, a3
 ; CHECK-NEXT:    vmor.mm v9, v9, v11
-; CHECK-NEXT:    vmseq.vx v11, v8, a7
+; CHECK-NEXT:    vmseq.vx v11, v8, a4
 ; CHECK-NEXT:    vmor.mm v9, v9, v10
-; CHECK-NEXT:    vmseq.vx v10, v8, t0
+; CHECK-NEXT:    vmseq.vx v10, v8, a5
 ; CHECK-NEXT:    vmor.mm v9, v9, v11
-; CHECK-NEXT:    vmseq.vx v11, v8, t1
+; CHECK-NEXT:    vmseq.vx v11, v8, a6
 ; CHECK-NEXT:    vmor.mm v9, v9, v10
-; CHECK-NEXT:    vmseq.vx v10, v8, t2
 ; CHECK-NEXT:    vmor.mm v9, v9, v11
-; CHECK-NEXT:    vmseq.vx v11, v8, t3
-; CHECK-NEXT:    vmor.mm v9, v9, v10
-; CHECK-NEXT:    vmseq.vx v10, v8, t4
-; CHECK-NEXT:    vmor.mm v9, v9, v11
-; CHECK-NEXT:    vmseq.vx v11, v8, t5
-; CHECK-NEXT:    vmor.mm v9, v9, v10
-; CHECK-NEXT:    vmseq.vx v10, v8, a0
-; CHECK-NEXT:    vmor.mm v9, v9, v11
-; CHECK-NEXT:    vmor.mm v9, v9, v10
-; CHECK-NEXT:    vmseq.vx v8, v8, a1
+; CHECK-NEXT:    vmseq.vx v8, v8, a7
 ; CHECK-NEXT:    vmor.mm v8, v9, v8
 ; CHECK-NEXT:    vmand.mm v0, v8, v0
 ; CHECK-NEXT:    ret
@@ -456,387 +449,251 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
 define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
 ; RV32-LABEL: match_nxv16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s1, -12
-; RV32-NEXT:    .cfi_offset s2, -16
-; RV32-NEXT:    .cfi_offset s3, -20
-; RV32-NEXT:    .cfi_offset s4, -24
-; RV32-NEXT:    .cfi_offset s5, -28
-; RV32-NEXT:    .cfi_offset s6, -32
-; RV32-NEXT:    .cfi_offset s7, -36
-; RV32-NEXT:    .cfi_offset s8, -40
-; RV32-NEXT:    .cfi_offset s9, -44
-; RV32-NEXT:    .cfi_offset s10, -48
-; RV32-NEXT:    .cfi_offset s11, -52
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    vslidedown.vi v12, v10, 1
-; RV32-NEXT:    vslidedown.vi v13, v10, 2
-; RV32-NEXT:    vslidedown.vi v14, v10, 3
-; RV32-NEXT:    vslidedown.vi v15, v10, 4
-; RV32-NEXT:    vslidedown.vi v16, v10, 5
-; RV32-NEXT:    vslidedown.vi v17, v10, 6
-; RV32-NEXT:    vslidedown.vi v18, v10, 7
-; RV32-NEXT:    vslidedown.vi v19, v10, 8
-; RV32-NEXT:    vslidedown.vi v20, v10, 9
-; RV32-NEXT:    vslidedown.vi v21, v10, 10
-; RV32-NEXT:    vslidedown.vi v22, v10, 11
-; RV32-NEXT:    vslidedown.vi v23, v10, 12
-; RV32-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v10, 16
-; RV32-NEXT:    vmv.x.s a1, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 17
-; RV32-NEXT:    vmv.x.s a2, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 18
-; RV32-NEXT:    vmv.x.s a3, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 19
-; RV32-NEXT:    vmv.x.s a4, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 20
-; RV32-NEXT:    vmv.x.s a5, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 21
-; RV32-NEXT:    vmv.x.s a6, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 22
-; RV32-NEXT:    vmv.x.s a7, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 23
-; RV32-NEXT:    vmv.x.s t0, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 24
-; RV32-NEXT:    vmv.x.s t1, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 25
-; RV32-NEXT:    vmv.x.s t2, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 26
-; RV32-NEXT:    vmv.x.s t3, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 27
-; RV32-NEXT:    vmv.x.s t4, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 28
-; RV32-NEXT:    vmv.x.s t5, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 29
-; RV32-NEXT:    vmv.x.s t6, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 30
-; RV32-NEXT:    vmv.x.s s0, v24
-; RV32-NEXT:    vslidedown.vi v24, v10, 31
-; RV32-NEXT:    vmv.x.s s1, v24
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v11, v10, 13
-; RV32-NEXT:    vslidedown.vi v24, v10, 14
-; RV32-NEXT:    vslidedown.vi v10, v10, 15
-; RV32-NEXT:    vmv.x.s s2, v12
-; RV32-NEXT:    vmv.x.s s3, v13
-; RV32-NEXT:    vmv.x.s s4, v14
-; RV32-NEXT:    vmv.x.s s5, v15
-; RV32-NEXT:    vmv.x.s s6, v16
-; RV32-NEXT:    vmv.x.s s7, v17
-; RV32-NEXT:    vmv.x.s s8, v18
-; RV32-NEXT:    vmv.x.s s9, v19
-; RV32-NEXT:    vmv.x.s s10, v20
-; RV32-NEXT:    vmv.x.s s11, v21
-; RV32-NEXT:    vmv.x.s ra, v22
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    vmseq.vx v12, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v23
-; RV32-NEXT:    vmseq.vx v13, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v24
-; RV32-NEXT:    vmseq.vx v14, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s5
-; RV32-NEXT:    vmor.mm v12, v12, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s6
-; RV32-NEXT:    vmor.mm v11, v12, v11
-; RV32-NEXT:    vmseq.vx v12, v8, s7
+; RV32-NEXT:    vrgather.vi v14, v10, 1
+; RV32-NEXT:    vrgather.vi v16, v10, 0
+; RV32-NEXT:    vrgather.vi v18, v10, 2
+; RV32-NEXT:    vrgather.vi v20, v10, 3
+; RV32-NEXT:    vrgather.vi v22, v10, 4
+; RV32-NEXT:    vrgather.vi v24, v10, 5
+; RV32-NEXT:    vrgather.vi v26, v10, 6
+; RV32-NEXT:    vrgather.vi v28, v10, 7
+; RV32-NEXT:    vmseq.vv v12, v8, v14
+; RV32-NEXT:    vmseq.vv v13, v8, v16
+; RV32-NEXT:    vrgather.vi v30, v10, 8
+; RV32-NEXT:    vmseq.vv v14, v8, v18
+; RV32-NEXT:    vmseq.vv v15, v8, v20
+; RV32-NEXT:    vrgather.vi v6, v10, 9
+; RV32-NEXT:    vmseq.vv v16, v8, v22
+; RV32-NEXT:    vmseq.vv v17, v8, v24
+; RV32-NEXT:    vrgather.vi v24, v10, 10
+; RV32-NEXT:    vmseq.vv v18, v8, v26
+; RV32-NEXT:    vmseq.vv v19, v8, v28
+; RV32-NEXT:    vrgather.vi v26, v10, 11
+; RV32-NEXT:    vmseq.vv v20, v8, v30
+; RV32-NEXT:    vmseq.vv v21, v8, v6
+; RV32-NEXT:    vrgather.vi v28, v10, 12
+; RV32-NEXT:    vmseq.vv v22, v8, v24
+; RV32-NEXT:    vmseq.vv v23, v8, v26
+; RV32-NEXT:    vrgather.vi v26, v10, 13
+; RV32-NEXT:    vmseq.vv v25, v8, v28
+; RV32-NEXT:    vmseq.vv v24, v8, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 16
+; RV32-NEXT:    vmv.x.s a0, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 17
+; RV32-NEXT:    vmv.x.s a1, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 18
+; RV32-NEXT:    vmv.x.s a2, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 19
+; RV32-NEXT:    vmv.x.s a3, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 20
+; RV32-NEXT:    vmv.x.s a4, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 21
+; RV32-NEXT:    vmv.x.s a5, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 22
+; RV32-NEXT:    vmv.x.s a6, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 23
+; RV32-NEXT:    vmv.x.s a7, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 24
+; RV32-NEXT:    vmv.x.s t0, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 25
+; RV32-NEXT:    vmv.x.s t1, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 26
+; RV32-NEXT:    vmv.x.s t2, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 27
+; RV32-NEXT:    vmv.x.s t3, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 28
+; RV32-NEXT:    vmv.x.s t4, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 29
+; RV32-NEXT:    vmv.x.s t5, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 30
+; RV32-NEXT:    vmv.x.s t6, v26
+; RV32-NEXT:    vslidedown.vi v26, v10, 31
+; RV32-NEXT:    vmv.x.s s0, v26
+; RV32-NEXT:    vrgather.vi v26, v10, 14
+; RV32-NEXT:    vmseq.vv v28, v8, v26
+; RV32-NEXT:    vrgather.vi v26, v10, 15
+; RV32-NEXT:    vmseq.vv v10, v8, v26
+; RV32-NEXT:    vmor.mm v11, v13, v12
 ; RV32-NEXT:    vmor.mm v11, v11, v14
-; RV32-NEXT:    vmseq.vx v14, v8, s8
+; RV32-NEXT:    vmor.mm v11, v11, v15
+; RV32-NEXT:    vmor.mm v11, v11, v16
+; RV32-NEXT:    vmor.mm v11, v11, v17
+; RV32-NEXT:    vmor.mm v11, v11, v18
+; RV32-NEXT:    vmor.mm v11, v11, v19
+; RV32-NEXT:    vmor.mm v11, v11, v20
+; RV32-NEXT:    vmor.mm v11, v11, v21
+; RV32-NEXT:    vmor.mm v11, v11, v22
+; RV32-NEXT:    vmor.mm v11, v11, v23
+; RV32-NEXT:    vmor.mm v11, v11, v25
+; RV32-NEXT:    vmseq.vx v12, v8, a0
+; RV32-NEXT:    vmor.mm v11, v11, v24
+; RV32-NEXT:    vmseq.vx v13, v8, a1
+; RV32-NEXT:    vmor.mm v11, v11, v28
+; RV32-NEXT:    vmseq.vx v14, v8, a2
 ; RV32-NEXT:    vmor.mm v10, v11, v10
-; RV32-NEXT:    vmseq.vx v11, v8, s9
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s10
+; RV32-NEXT:    vmseq.vx v11, v8, a3
 ; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s11
-; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, ra
-; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a0
+; RV32-NEXT:    vmseq.vx v12, v8, a4
 ; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s2
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s3
+; RV32-NEXT:    vmseq.vx v13, v8, a5
 ; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, s4
+; RV32-NEXT:    vmseq.vx v14, v8, a6
 ; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a1
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, a2
+; RV32-NEXT:    vmseq.vx v11, v8, a7
 ; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, a3
-; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, a4
-; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a5
+; RV32-NEXT:    vmseq.vx v12, v8, t0
 ; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, a6
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, a7
+; RV32-NEXT:    vmseq.vx v13, v8, t1
 ; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, t0
+; RV32-NEXT:    vmseq.vx v14, v8, t2
 ; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, t1
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t2
+; RV32-NEXT:    vmseq.vx v11, v8, t3
 ; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, t3
-; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, t4
-; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, t5
+; RV32-NEXT:    vmseq.vx v12, v8, t4
 ; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t6
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s0
+; RV32-NEXT:    vmseq.vx v13, v8, t5
 ; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, t6
 ; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmor.mm v10, v10, v13
 ; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v11, v8, s1
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v11, v8, s0
 ; RV32-NEXT:    vmor.mm v8, v10, v11
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    .cfi_restore s1
-; RV32-NEXT:    .cfi_restore s2
-; RV32-NEXT:    .cfi_restore s3
-; RV32-NEXT:    .cfi_restore s4
-; RV32-NEXT:    .cfi_restore s5
-; RV32-NEXT:    .cfi_restore s6
-; RV32-NEXT:    .cfi_restore s7
-; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    .cfi_restore s10
-; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_nxv16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -112
-; RV64-NEXT:    .cfi_def_cfa_offset 112
-; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s1, -24
-; RV64-NEXT:    .cfi_offset s2, -32
-; RV64-NEXT:    .cfi_offset s3, -40
-; RV64-NEXT:    .cfi_offset s4, -48
-; RV64-NEXT:    .cfi_offset s5, -56
-; RV64-NEXT:    .cfi_offset s6, -64
-; RV64-NEXT:    .cfi_offset s7, -72
-; RV64-NEXT:    .cfi_offset s8, -80
-; RV64-NEXT:    .cfi_offset s9, -88
-; RV64-NEXT:    .cfi_offset s10, -96
-; RV64-NEXT:    .cfi_offset s11, -104
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64-NEXT:    vslidedown.vi v12, v10, 1
-; RV64-NEXT:    vslidedown.vi v13, v10, 2
-; RV64-NEXT:    vslidedown.vi v14, v10, 3
-; RV64-NEXT:    vslidedown.vi v15, v10, 4
-; RV64-NEXT:    vslidedown.vi v16, v10, 5
-; RV64-NEXT:    vslidedown.vi v17, v10, 6
-; RV64-NEXT:    vslidedown.vi v18, v10, 7
-; RV64-NEXT:    vslidedown.vi v19, v10, 8
-; RV64-NEXT:    vslidedown.vi v20, v10, 9
-; RV64-NEXT:    vslidedown.vi v21, v10, 10
-; RV64-NEXT:    vslidedown.vi v22, v10, 11
-; RV64-NEXT:    vslidedown.vi v23, v10, 12
-; RV64-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v10, 16
-; RV64-NEXT:    vmv.x.s a1, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 17
-; RV64-NEXT:    vmv.x.s a2, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 18
-; RV64-NEXT:    vmv.x.s a3, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 19
-; RV64-NEXT:    vmv.x.s a4, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 20
-; RV64-NEXT:    vmv.x.s a5, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 21
-; RV64-NEXT:    vmv.x.s a6, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 22
-; RV64-NEXT:    vmv.x.s a7, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 23
-; RV64-NEXT:    vmv.x.s t0, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 24
-; RV64-NEXT:    vmv.x.s t1, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 25
-; RV64-NEXT:    vmv.x.s t2, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 26
-; RV64-NEXT:    vmv.x.s t3, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 27
-; RV64-NEXT:    vmv.x.s t4, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 28
-; RV64-NEXT:    vmv.x.s t5, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 29
-; RV64-NEXT:    vmv.x.s t6, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 30
-; RV64-NEXT:    vmv.x.s s0, v24
-; RV64-NEXT:    vslidedown.vi v24, v10, 31
-; RV64-NEXT:    vmv.x.s s1, v24
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v11, v10, 13
-; RV64-NEXT:    vslidedown.vi v24, v10, 14
-; RV64-NEXT:    vslidedown.vi v10, v10, 15
-; RV64-NEXT:    vmv.x.s s2, v12
-; RV64-NEXT:    vmv.x.s s3, v13
-; RV64-NEXT:    vmv.x.s s4, v14
-; RV64-NEXT:    vmv.x.s s5, v15
-; RV64-NEXT:    vmv.x.s s6, v16
-; RV64-NEXT:    vmv.x.s s7, v17
-; RV64-NEXT:    vmv.x.s s8, v18
-; RV64-NEXT:    vmv.x.s s9, v19
-; RV64-NEXT:    vmv.x.s s10, v20
-; RV64-NEXT:    vmv.x.s s11, v21
-; RV64-NEXT:    vmv.x.s ra, v22
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
 ; RV64-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64-NEXT:    vmseq.vx v12, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v23
-; RV64-NEXT:    vmseq.vx v13, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v24
-; RV64-NEXT:    vmseq.vx v14, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s5
-; RV64-NEXT:    vmor.mm v12, v12, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s6
-; RV64-NEXT:    vmor.mm v11, v12, v11
-; RV64-NEXT:    vmseq.vx v12, v8, s7
+; RV64-NEXT:    vrgather.vi v14, v10, 1
+; RV64-NEXT:    vrgather.vi v16, v10, 0
+; RV64-NEXT:    vrgather.vi v18, v10, 2
+; RV64-NEXT:    vrgather.vi v20, v10, 3
+; RV64-NEXT:    vrgather.vi v22, v10, 4
+; RV64-NEXT:    vrgather.vi v24, v10, 5
+; RV64-NEXT:    vrgather.vi v26, v10, 6
+; RV64-NEXT:    vrgather.vi v28, v10, 7
+; RV64-NEXT:    vmseq.vv v12, v8, v14
+; RV64-NEXT:    vmseq.vv v13, v8, v16
+; RV64-NEXT:    vrgather.vi v30, v10, 8
+; RV64-NEXT:    vmseq.vv v14, v8, v18
+; RV64-NEXT:    vmseq.vv v15, v8, v20
+; RV64-NEXT:    vrgather.vi v6, v10, 9
+; RV64-NEXT:    vmseq.vv v16, v8, v22
+; RV64-NEXT:    vmseq.vv v17, v8, v24
+; RV64-NEXT:    vrgather.vi v24, v10, 10
+; RV64-NEXT:    vmseq.vv v18, v8, v26
+; RV64-NEXT:    vmseq.vv v19, v8, v28
+; RV64-NEXT:    vrgather.vi v26, v10, 11
+; RV64-NEXT:    vmseq.vv v20, v8, v30
+; RV64-NEXT:    vmseq.vv v21, v8, v6
+; RV64-NEXT:    vrgather.vi v28, v10, 12
+; RV64-NEXT:    vmseq.vv v22, v8, v24
+; RV64-NEXT:    vmseq.vv v23, v8, v26
+; RV64-NEXT:    vrgather.vi v26, v10, 13
+; RV64-NEXT:    vmseq.vv v25, v8, v28
+; RV64-NEXT:    vmseq.vv v24, v8, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 16
+; RV64-NEXT:    vmv.x.s a0, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 17
+; RV64-NEXT:    vmv.x.s a1, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 18
+; RV64-NEXT:    vmv.x.s a2, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 19
+; RV64-NEXT:    vmv.x.s a3, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 20
+; RV64-NEXT:    vmv.x.s a4, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 21
+; RV64-NEXT:    vmv.x.s a5, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 22
+; RV64-NEXT:    vmv.x.s a6, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 23
+; RV64-NEXT:    vmv.x.s a7, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 24
+; RV64-NEXT:    vmv.x.s t0, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 25
+; RV64-NEXT:    vmv.x.s t1, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 26
+; RV64-NEXT:    vmv.x.s t2, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 27
+; RV64-NEXT:    vmv.x.s t3, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 28
+; RV64-NEXT:    vmv.x.s t4, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 29
+; RV64-NEXT:    vmv.x.s t5, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 30
+; RV64-NEXT:    vmv.x.s t6, v26
+; RV64-NEXT:    vslidedown.vi v26, v10, 31
+; RV64-NEXT:    vmv.x.s s0, v26
+; RV64-NEXT:    vrgather.vi v26, v10, 14
+; RV64-NEXT:    vmseq.vv v28, v8, v26
+; RV64-NEXT:    vrgather.vi v26, v10, 15
+; RV64-NEXT:    vmseq.vv v10, v8, v26
+; RV64-NEXT:    vmor.mm v11, v13, v12
 ; RV64-NEXT:    vmor.mm v11, v11, v14
-; RV64-NEXT:    vmseq.vx v14, v8, s8
+; RV64-NEXT:    vmor.mm v11, v11, v15
+; RV64-NEXT:    vmor.mm v11, v11, v16
+; RV64-NEXT:    vmor.mm v11, v11, v17
+; RV64-NEXT:    vmor.mm v11, v11, v18
+; RV64-NEXT:    vmor.mm v11, v11, v19
+; RV64-NEXT:    vmor.mm v11, v11, v20
+; RV64-NEXT:    vmor.mm v11, v11, v21
+; RV64-NEXT:    vmor.mm v11, v11, v22
+; RV64-NEXT:    vmor.mm v11, v11, v23
+; RV64-NEXT:    vmor.mm v11, v11, v25
+; RV64-NEXT:    vmseq.vx v12, v8, a0
+; RV64-NEXT:    vmor.mm v11, v11, v24
+; RV64-NEXT:    vmseq.vx v13, v8, a1
+; RV64-NEXT:    vmor.mm v11, v11, v28
+; RV64-NEXT:    vmseq.vx v14, v8, a2
 ; RV64-NEXT:    vmor.mm v10, v11, v10
-; RV64-NEXT:    vmseq.vx v11, v8, s9
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s10
+; RV64-NEXT:    vmseq.vx v11, v8, a3
 ; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s11
-; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, ra
-; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmseq.vx v12, v8, a4
 ; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s2
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s3
+; RV64-NEXT:    vmseq.vx v13, v8, a5
 ; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, s4
+; RV64-NEXT:    vmseq.vx v14, v8, a6
 ; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a1
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, a2
+; RV64-NEXT:    vmseq.vx v11, v8, a7
 ; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, a3
-; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, a4
-; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a5
+; RV64-NEXT:    vmseq.vx v12, v8, t0
 ; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, a6
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, a7
+; RV64-NEXT:    vmseq.vx v13, v8, t1
 ; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, t0
+; RV64-NEXT:    vmseq.vx v14, v8, t2
 ; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, t1
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t2
+; RV64-NEXT:    vmseq.vx v11, v8, t3
 ; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, t3
-; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, t4
-; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, t5
+; RV64-NEXT:    vmseq.vx v12, v8, t4
 ; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t6
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s0
+; RV64-NEXT:    vmseq.vx v13, v8, t5
 ; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, t6
 ; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmor.mm v10, v10, v13
 ; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v11, v8, s1
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v11, v8, s0
 ; RV64-NEXT:    vmor.mm v8, v10, v11
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore s0
-; RV64-NEXT:    .cfi_restore s1
-; RV64-NEXT:    .cfi_restore s2
-; RV64-NEXT:    .cfi_restore s3
-; RV64-NEXT:    .cfi_restore s4
-; RV64-NEXT:    .cfi_restore s5
-; RV64-NEXT:    .cfi_restore s6
-; RV64-NEXT:    .cfi_restore s7
-; RV64-NEXT:    .cfi_restore s8
-; RV64-NEXT:    .cfi_restore s9
-; RV64-NEXT:    .cfi_restore s10
-; RV64-NEXT:    .cfi_restore s11
-; RV64-NEXT:    addi sp, sp, 112
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
@@ -846,381 +703,255 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
 ; RV32-LABEL: match_v16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s1, -12
-; RV32-NEXT:    .cfi_offset s2, -16
-; RV32-NEXT:    .cfi_offset s3, -20
-; RV32-NEXT:    .cfi_offset s4, -24
-; RV32-NEXT:    .cfi_offset s5, -28
-; RV32-NEXT:    .cfi_offset s6, -32
-; RV32-NEXT:    .cfi_offset s7, -36
-; RV32-NEXT:    .cfi_offset s8, -40
-; RV32-NEXT:    .cfi_offset s9, -44
-; RV32-NEXT:    .cfi_offset s10, -48
-; RV32-NEXT:    .cfi_offset s11, -52
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vslidedown.vi v9, v10, 1
-; RV32-NEXT:    vslidedown.vi v12, v10, 2
-; RV32-NEXT:    vslidedown.vi v13, v10, 3
-; RV32-NEXT:    vslidedown.vi v14, v10, 4
-; RV32-NEXT:    vslidedown.vi v15, v10, 5
-; RV32-NEXT:    vslidedown.vi v16, v10, 6
-; RV32-NEXT:    vslidedown.vi v17, v10, 7
-; RV32-NEXT:    vslidedown.vi v18, v10, 8
-; RV32-NEXT:    vslidedown.vi v19, v10, 9
-; RV32-NEXT:    vslidedown.vi v20, v10, 10
-; RV32-NEXT:    vslidedown.vi v21, v10, 11
-; RV32-NEXT:    vslidedown.vi v22, v10, 12
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT:    vrgather.vi v9, v10, 1
+; RV32-NEXT:    vrgather.vi v12, v10, 0
+; RV32-NEXT:    vrgather.vi v13, v10, 2
+; RV32-NEXT:    vrgather.vi v14, v10, 3
+; RV32-NEXT:    vrgather.vi v15, v10, 4
+; RV32-NEXT:    vrgather.vi v16, v10, 5
+; RV32-NEXT:    vrgather.vi v17, v10, 6
+; RV32-NEXT:    vrgather.vi v18, v10, 7
+; RV32-NEXT:    vrgather.vi v19, v10, 8
+; RV32-NEXT:    vrgather.vi v20, v10, 9
+; RV32-NEXT:    vrgather.vi v21, v10, 10
+; RV32-NEXT:    vrgather.vi v22, v10, 11
+; RV32-NEXT:    vrgather.vi v23, v10, 12
 ; RV32-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v10, 16
-; RV32-NEXT:    vmv.x.s a1, v24
+; RV32-NEXT:    vmv.x.s a0, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 17
-; RV32-NEXT:    vmv.x.s a2, v24
+; RV32-NEXT:    vmv.x.s a1, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 18
-; RV32-NEXT:    vmv.x.s a3, v24
+; RV32-NEXT:    vmv.x.s a2, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 19
-; RV32-NEXT:    vmv.x.s a4, v24
+; RV32-NEXT:    vmv.x.s a3, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 20
-; RV32-NEXT:    vmv.x.s a5, v24
+; RV32-NEXT:    vmv.x.s a4, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 21
-; RV32-NEXT:    vmv.x.s a6, v24
+; RV32-NEXT:    vmv.x.s a5, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 22
-; RV32-NEXT:    vmv.x.s a7, v24
+; RV32-NEXT:    vmv.x.s a6, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 23
-; RV32-NEXT:    vmv.x.s t0, v24
+; RV32-NEXT:    vmv.x.s a7, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 24
-; RV32-NEXT:    vmv.x.s t1, v24
+; RV32-NEXT:    vmv.x.s t0, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 25
-; RV32-NEXT:    vmv.x.s t2, v24
+; RV32-NEXT:    vmv.x.s t1, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 26
-; RV32-NEXT:    vmv.x.s t3, v24
+; RV32-NEXT:    vmv.x.s t2, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 27
-; RV32-NEXT:    vmv.x.s t4, v24
+; RV32-NEXT:    vmv.x.s t3, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 28
-; RV32-NEXT:    vmv.x.s t5, v24
+; RV32-NEXT:    vmv.x.s t4, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 29
-; RV32-NEXT:    vmv.x.s t6, v24
+; RV32-NEXT:    vmv.x.s t5, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 30
-; RV32-NEXT:    vmv.x.s s0, v24
+; RV32-NEXT:    vmv.x.s t6, v24
 ; RV32-NEXT:    vslidedown.vi v24, v10, 31
-; RV32-NEXT:    vmv.x.s s1, v24
+; RV32-NEXT:    vmv.x.s s0, v24
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v11, v10, 13
-; RV32-NEXT:    vslidedown.vi v23, v10, 14
-; RV32-NEXT:    vslidedown.vi v10, v10, 15
-; RV32-NEXT:    vmv.x.s s2, v9
-; RV32-NEXT:    vmv.x.s s3, v12
-; RV32-NEXT:    vmv.x.s s4, v13
-; RV32-NEXT:    vmv.x.s s5, v14
-; RV32-NEXT:    vmv.x.s s6, v15
-; RV32-NEXT:    vmv.x.s s7, v16
-; RV32-NEXT:    vmv.x.s s8, v17
-; RV32-NEXT:    vmv.x.s s9, v18
-; RV32-NEXT:    vmv.x.s s10, v19
-; RV32-NEXT:    vmv.x.s s11, v20
-; RV32-NEXT:    vmv.x.s ra, v21
-; RV32-NEXT:    vmseq.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v22
-; RV32-NEXT:    vmseq.vx v12, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v23
-; RV32-NEXT:    vmseq.vx v13, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s5
-; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s6
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s7
-; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s8
+; RV32-NEXT:    vrgather.vi v11, v10, 13
+; RV32-NEXT:    vrgather.vi v24, v10, 14
+; RV32-NEXT:    vrgather.vi v25, v10, 15
+; RV32-NEXT:    vmseq.vv v9, v8, v9
+; RV32-NEXT:    vmseq.vv v10, v8, v12
+; RV32-NEXT:    vmor.mm v9, v10, v9
+; RV32-NEXT:    vmseq.vv v10, v8, v13
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s9
-; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s10
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s11
-; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, ra
+; RV32-NEXT:    vmseq.vv v10, v8, v14
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v15
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v16
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v17
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v18
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v19
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v20
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v21
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v22
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v23
 ; RV32-NEXT:    vmor.mm v9, v9, v10
 ; RV32-NEXT:    vmseq.vx v10, v8, a0
-; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s2
+; RV32-NEXT:    vmseq.vv v11, v8, v11
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s3
-; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s4
-; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, a1
+; RV32-NEXT:    vmseq.vx v11, v8, a1
+; RV32-NEXT:    vmseq.vv v12, v8, v24
 ; RV32-NEXT:    vmor.mm v9, v9, v12
 ; RV32-NEXT:    vmseq.vx v12, v8, a2
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a3
+; RV32-NEXT:    vmseq.vv v13, v8, v25
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, a4
+; RV32-NEXT:    vmseq.vx v13, v8, a3
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, a5
+; RV32-NEXT:    vmseq.vx v10, v8, a4
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a5
 ; RV32-NEXT:    vmor.mm v9, v9, v12
 ; RV32-NEXT:    vmseq.vx v12, v8, a6
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a7
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t0
+; RV32-NEXT:    vmseq.vx v13, v8, a7
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, t1
+; RV32-NEXT:    vmseq.vx v10, v8, t0
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, t1
 ; RV32-NEXT:    vmor.mm v9, v9, v12
 ; RV32-NEXT:    vmseq.vx v12, v8, t2
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, t3
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t4
+; RV32-NEXT:    vmseq.vx v13, v8, t3
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, t5
+; RV32-NEXT:    vmseq.vx v10, v8, t4
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, t5
 ; RV32-NEXT:    vmor.mm v9, v9, v12
 ; RV32-NEXT:    vmseq.vx v12, v8, t6
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s0
 ; RV32-NEXT:    vmor.mm v9, v9, v13
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmor.mm v9, v9, v12
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v8, v8, s1
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v8, v8, s0
 ; RV32-NEXT:    vmor.mm v8, v9, v8
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    .cfi_restore s1
-; RV32-NEXT:    .cfi_restore s2
-; RV32-NEXT:    .cfi_restore s3
-; RV32-NEXT:    .cfi_restore s4
-; RV32-NEXT:    .cfi_restore s5
-; RV32-NEXT:    .cfi_restore s6
-; RV32-NEXT:    .cfi_restore s7
-; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    .cfi_restore s10
-; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_v16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -112
-; RV64-NEXT:    .cfi_def_cfa_offset 112
-; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s1, -24
-; RV64-NEXT:    .cfi_offset s2, -32
-; RV64-NEXT:    .cfi_offset s3, -40
-; RV64-NEXT:    .cfi_offset s4, -48
-; RV64-NEXT:    .cfi_offset s5, -56
-; RV64-NEXT:    .cfi_offset s6, -64
-; RV64-NEXT:    .cfi_offset s7, -72
-; RV64-NEXT:    .cfi_offset s8, -80
-; RV64-NEXT:    .cfi_offset s9, -88
-; RV64-NEXT:    .cfi_offset s10, -96
-; RV64-NEXT:    .cfi_offset s11, -104
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    vslidedown.vi v9, v10, 1
-; RV64-NEXT:    vslidedown.vi v12, v10, 2
-; RV64-NEXT:    vslidedown.vi v13, v10, 3
-; RV64-NEXT:    vslidedown.vi v14, v10, 4
-; RV64-NEXT:    vslidedown.vi v15, v10, 5
-; RV64-NEXT:    vslidedown.vi v16, v10, 6
-; RV64-NEXT:    vslidedown.vi v17, v10, 7
-; RV64-NEXT:    vslidedown.vi v18, v10, 8
-; RV64-NEXT:    vslidedown.vi v19, v10, 9
-; RV64-NEXT:    vslidedown.vi v20, v10, 10
-; RV64-NEXT:    vslidedown.vi v21, v10, 11
-; RV64-NEXT:    vslidedown.vi v22, v10, 12
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT:    vrgather.vi v9, v10, 1
+; RV64-NEXT:    vrgather.vi v12, v10, 0
+; RV64-NEXT:    vrgather.vi v13, v10, 2
+; RV64-NEXT:    vrgather.vi v14, v10, 3
+; RV64-NEXT:    vrgather.vi v15, v10, 4
+; RV64-NEXT:    vrgather.vi v16, v10, 5
+; RV64-NEXT:    vrgather.vi v17, v10, 6
+; RV64-NEXT:    vrgather.vi v18, v10, 7
+; RV64-NEXT:    vrgather.vi v19, v10, 8
+; RV64-NEXT:    vrgather.vi v20, v10, 9
+; RV64-NEXT:    vrgather.vi v21, v10, 10
+; RV64-NEXT:    vrgather.vi v22, v10, 11
+; RV64-NEXT:    vrgather.vi v23, v10, 12
 ; RV64-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v10, 16
-; RV64-NEXT:    vmv.x.s a1, v24
+; RV64-NEXT:    vmv.x.s a0, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 17
-; RV64-NEXT:    vmv.x.s a2, v24
+; RV64-NEXT:    vmv.x.s a1, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 18
-; RV64-NEXT:    vmv.x.s a3, v24
+; RV64-NEXT:    vmv.x.s a2, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 19
-; RV64-NEXT:    vmv.x.s a4, v24
+; RV64-NEXT:    vmv.x.s a3, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 20
-; RV64-NEXT:    vmv.x.s a5, v24
+; RV64-NEXT:    vmv.x.s a4, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 21
-; RV64-NEXT:    vmv.x.s a6, v24
+; RV64-NEXT:    vmv.x.s a5, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 22
-; RV64-NEXT:    vmv.x.s a7, v24
+; RV64-NEXT:    vmv.x.s a6, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 23
-; RV64-NEXT:    vmv.x.s t0, v24
+; RV64-NEXT:    vmv.x.s a7, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 24
-; RV64-NEXT:    vmv.x.s t1, v24
+; RV64-NEXT:    vmv.x.s t0, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 25
-; RV64-NEXT:    vmv.x.s t2, v24
+; RV64-NEXT:    vmv.x.s t1, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 26
-; RV64-NEXT:    vmv.x.s t3, v24
+; RV64-NEXT:    vmv.x.s t2, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 27
-; RV64-NEXT:    vmv.x.s t4, v24
+; RV64-NEXT:    vmv.x.s t3, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 28
-; RV64-NEXT:    vmv.x.s t5, v24
+; RV64-NEXT:    vmv.x.s t4, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 29
-; RV64-NEXT:    vmv.x.s t6, v24
+; RV64-NEXT:    vmv.x.s t5, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 30
-; RV64-NEXT:    vmv.x.s s0, v24
+; RV64-NEXT:    vmv.x.s t6, v24
 ; RV64-NEXT:    vslidedown.vi v24, v10, 31
-; RV64-NEXT:    vmv.x.s s1, v24
+; RV64-NEXT:    vmv.x.s s0, v24
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v11, v10, 13
-; RV64-NEXT:    vslidedown.vi v23, v10, 14
-; RV64-NEXT:    vslidedown.vi v10, v10, 15
-; RV64-NEXT:    vmv.x.s s2, v9
-; RV64-NEXT:    vmv.x.s s3, v12
-; RV64-NEXT:    vmv.x.s s4, v13
-; RV64-NEXT:    vmv.x.s s5, v14
-; RV64-NEXT:    vmv.x.s s6, v15
-; RV64-NEXT:    vmv.x.s s7, v16
-; RV64-NEXT:    vmv.x.s s8, v17
-; RV64-NEXT:    vmv.x.s s9, v18
-; RV64-NEXT:    vmv.x.s s10, v19
-; RV64-NEXT:    vmv.x.s s11, v20
-; RV64-NEXT:    vmv.x.s ra, v21
-; RV64-NEXT:    vmseq.vx v9, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v22
-; RV64-NEXT:    vmseq.vx v12, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v23
-; RV64-NEXT:    vmseq.vx v13, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s5
-; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s6
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s7
-; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s8
+; RV64-NEXT:    vrgather.vi v11, v10, 13
+; RV64-NEXT:    vrgather.vi v24, v10, 14
+; RV64-NEXT:    vrgather.vi v25, v10, 15
+; RV64-NEXT:    vmseq.vv v9, v8, v9
+; RV64-NEXT:    vmseq.vv v10, v8, v12
+; RV64-NEXT:    vmor.mm v9, v10, v9
+; RV64-NEXT:    vmseq.vv v10, v8, v13
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s9
-; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s10
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s11
-; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, ra
+; RV64-NEXT:    vmseq.vv v10, v8, v14
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v15
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v16
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v17
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v18
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v19
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v20
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v21
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v22
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vv v10, v8, v23
 ; RV64-NEXT:    vmor.mm v9, v9, v10
 ; RV64-NEXT:    vmseq.vx v10, v8, a0
-; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s2
+; RV64-NEXT:    vmseq.vv v11, v8, v11
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s3
-; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s4
-; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, a1
+; RV64-NEXT:    vmseq.vx v11, v8, a1
+; RV64-NEXT:    vmseq.vv v12, v8, v24
 ; RV64-NEXT:    vmor.mm v9, v9, v12
 ; RV64-NEXT:    vmseq.vx v12, v8, a2
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a3
+; RV64-NEXT:    vmseq.vv v13, v8, v25
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, a4
+; RV64-NEXT:    vmseq.vx v13, v8, a3
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, a5
+; RV64-NEXT:    vmseq.vx v10, v8, a4
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a5
 ; RV64-NEXT:    vmor.mm v9, v9, v12
 ; RV64-NEXT:    vmseq.vx v12, v8, a6
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a7
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t0
+; RV64-NEXT:    vmseq.vx v13, v8, a7
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, t1
+; RV64-NEXT:    vmseq.vx v10, v8, t0
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, t1
 ; RV64-NEXT:    vmor.mm v9, v9, v12
 ; RV64-NEXT:    vmseq.vx v12, v8, t2
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, t3
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t4
+; RV64-NEXT:    vmseq.vx v13, v8, t3
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, t5
+; RV64-NEXT:    vmseq.vx v10, v8, t4
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, t5
 ; RV64-NEXT:    vmor.mm v9, v9, v12
 ; RV64-NEXT:    vmseq.vx v12, v8, t6
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s0
 ; RV64-NEXT:    vmor.mm v9, v9, v13
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmor.mm v9, v9, v12
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v8, v8, s1
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v8, v8, s0
 ; RV64-NEXT:    vmor.mm v8, v9, v8
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore s0
-; RV64-NEXT:    .cfi_restore s1
-; RV64-NEXT:    .cfi_restore s2
-; RV64-NEXT:    .cfi_restore s3
-; RV64-NEXT:    .cfi_restore s4
-; RV64-NEXT:    .cfi_restore s5
-; RV64-NEXT:    .cfi_restore s6
-; RV64-NEXT:    .cfi_restore s7
-; RV64-NEXT:    .cfi_restore s8
-; RV64-NEXT:    .cfi_restore s9
-; RV64-NEXT:    .cfi_restore s10
-; RV64-NEXT:    .cfi_restore s11
-; RV64-NEXT:    addi sp, sp, 112
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)

diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
index a556c3125c85dbb..47db3da3fbe7ae5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
@@ -165,8 +165,8 @@ define <vscale x 4 x i8> @splat_idx_nxv8i8_nxv4i8_constant_0(<vscale x 8 x i8> %
 ; CHECK-LABEL: splat_idx_nxv8i8_nxv4i8_constant_0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vrgather.vi v9, v8, 0
+; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %x = extractelement <vscale x 8 x i8> %v, i64 0
   %ins = insertelement <vscale x 4 x i8> poison, i8 %x, i32 0
@@ -177,11 +177,9 @@ define <vscale x 4 x i8> @splat_idx_nxv8i8_nxv4i8_constant_0(<vscale x 8 x i8> %
 define <vscale x 4 x i8> @splat_idx_nxv8i8_nxv4i8_constant_3(<vscale x 8 x i8> %v) {
 ; CHECK-LABEL: splat_idx_nxv8i8_nxv4i8_constant_3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vrgather.vi v9, v8, 3
+; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %x = extractelement <vscale x 8 x i8> %v, i64 3
   %ins = insertelement <vscale x 4 x i8> poison, i8 %x, i32 0
@@ -210,8 +208,8 @@ define <8 x float> @splat_idx_nxv4f32_v8f32_constant_0(<vscale x 4 x float> %v)
 ; CHECK-LABEL: splat_idx_nxv4f32_v8f32_constant_0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vfmv.f.s fa5, v8
-; CHECK-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
   %x = extractelement <vscale x 4 x float> %v, i64 0
   %ins = insertelement <8 x float> poison, float %x, i32 0
@@ -222,11 +220,9 @@ define <8 x float> @splat_idx_nxv4f32_v8f32_constant_0(<vscale x 4 x float> %v)
 define <8 x float> @splat_idx_nxv4f32_v8f32_constant_7(<vscale x 4 x float> %v) {
 ; CHECK-LABEL: splat_idx_nxv4f32_v8f32_constant_7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 7
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NEXT:    vrgather.vi v10, v8, 7
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
   %x = extractelement <vscale x 4 x float> %v, i64 7
   %ins = insertelement <8 x float> poison, float %x, i32 0


        

