[llvm] [RISCV] Update matchSplatAsGather to use the index of extract_elt if in-bounds (PR #118873)
Mikhail R. Gadelha via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 14 07:31:25 PST 2025
https://github.com/mikhailramalho updated https://github.com/llvm/llvm-project/pull/118873
>From 22e7af0866d984acde86740eedcd4811d35c929d Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Thu, 9 Jan 2025 21:07:41 -0300
Subject: [PATCH 1/7] test
Signed-off-by: Mikhail R. Gadelha <mikhail at igalia.com>
---
llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll | 26 ++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
index 2e6df118401792..eb57b973eacfee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
@@ -105,3 +105,29 @@ define <vscale x 4 x float> @splat_idx_nxv4f32(<vscale x 4 x float> %v, i64 %idx
%splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
ret <vscale x 4 x float> %splat
}
+
+define <vscale x 4 x float> @splat_idx_constant_nxv8f32(<vscale x 8 x float> %v) {
+; CHECK-LABEL: splat_idx_constant_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vrgather.vi v10, v8, 0
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+ %x = extractelement <vscale x 8 x float> %v, i64 0
+ %ins = insertelement <vscale x 4 x float> poison, float %x, i32 0
+ %splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+ ret <vscale x 4 x float> %splat
+}
+
+define <vscale x 4 x i8> @splat_idx_constant_nxv4i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: splat_idx_constant_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: ret
+ %x = extractelement <vscale x 8 x i8> %v, i64 0
+ %ins = insertelement <vscale x 4 x i8> poison, i8 %x, i32 0
+ %splat = shufflevector <vscale x 4 x i8> %ins, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ ret <vscale x 4 x i8> %splat
+}
>From 1901289feb6c4405d1a1c2e925230015495ea301 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Thu, 5 Dec 2024 13:46:33 -0300
Subject: [PATCH 2/7] [RISCV] Update matchSplatAsGather to use the index of
extract_elt if it is in-bounds
This is a follow-up to #117878 and allows the use of vrgather when the
index we are accessing in VT is a constant and within bounds.
This patch replaces the previous behavior of bailing out when the length
of the search vector was greater than that of the vector of elements we
are searching for. Since matchSplatAsGather works on EXTRACT_VECTOR_ELT
and we know the index the element is being extracted from, we can safely
use vrgather.
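For context, a minimal IR sketch of the pattern this enables (it has the
same shape as the splat_idx_constant_nxv4i8 test added in the first
patch; the function name below is just illustrative). The element is
extracted at the constant index 0, which is known to be in-bounds for
the <vscale x 4 x i8> result, so the splat can be selected as
vrgather.vi even though the source vector is wider than the result type;
previously we bailed out and went through a scalar vmv.x.s/vmv.v.x:

  define <vscale x 4 x i8> @splat_from_wider_source(<vscale x 8 x i8> %v) {
    %x = extractelement <vscale x 8 x i8> %v, i64 0
    %ins = insertelement <vscale x 4 x i8> poison, i8 %x, i32 0
    %splat = shufflevector <vscale x 4 x i8> %ins, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
    ret <vscale x 4 x i8> %splat
  }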
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2eeca45ac414bd..8cd7e4f321e39d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3526,9 +3526,9 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
return SDValue();
// Check that Index lies within VT
- // TODO: Can we check if the Index is constant and known in-bounds?
- if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits()))
- return SDValue();
+ if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx))
+ if (VT.getVectorElementCount().getKnownMinValue() <= CIdx->getZExtValue())
+ return SDValue();
MVT ContainerVT = VT;
if (VT.isFixedLengthVector())
>From 9fdc30528f6c70e23d61da15f1768539d64e4350 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Thu, 5 Dec 2024 20:26:42 -0300
Subject: [PATCH 3/7] Let's still check the vector lengths if the index is not
constant
Signed-off-by: Mikhail R. Gadelha <mikhail at igalia.com>
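For illustration, a sketch of the case this guard keeps working (it
mirrors the existing splat_idx_nxv4f32 test; the function name is
hypothetical): when the extract index is not a constant, we still
require that the source vector is no wider than VT, so a pattern like
the following is only matched because both vectors have the same type:

  define <vscale x 4 x float> @splat_idx_variable(<vscale x 4 x float> %v, i64 %idx) {
    %x = extractelement <vscale x 4 x float> %v, i64 %idx
    %ins = insertelement <vscale x 4 x float> poison, float %x, i32 0
    %splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
    ret <vscale x 4 x float> %splat
  }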
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8cd7e4f321e39d..85904dd9c2d5f1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3526,9 +3526,11 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
return SDValue();
// Check that Index lies within VT
- if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx))
+ if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
if (VT.getVectorElementCount().getKnownMinValue() <= CIdx->getZExtValue())
return SDValue();
+ } else if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits()))
+ return SDValue();
MVT ContainerVT = VT;
if (VT.isFixedLengthVector())
>From f7ee8a66faaf9769553bd8508edc2ec8567e9c1f Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Fri, 10 Jan 2025 18:55:59 -0300
Subject: [PATCH 4/7] Try to extract the subvector when the vector of elements
 we are searching has a greater length
Signed-off-by: Mikhail R. Gadelha <mikhail at igalia.com>
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 85904dd9c2d5f1..72be770449acc2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3517,20 +3517,22 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
// different
// FIXME: Support i1 vectors, maybe by promoting to i8?
MVT EltTy = VT.getVectorElementType();
- if (EltTy == MVT::i1 ||
- EltTy != Vec.getSimpleValueType().getVectorElementType())
+ MVT VecVT = Vec.getSimpleValueType();
+ if (EltTy == MVT::i1 || EltTy != VecVT.getVectorElementType())
return SDValue();
SDValue Idx = SplatVal.getOperand(1);
// The index must be a legal type.
if (Idx.getValueType() != Subtarget.getXLenVT())
return SDValue();
- // Check that Index lies within VT
- if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
- if (VT.getVectorElementCount().getKnownMinValue() <= CIdx->getZExtValue())
+ // If the search vector is smaller than the vector of elements we are searching for,
+ // try to extract the subvector from it
+ if (VT.getVectorMinNumElements() < VecVT.getVectorMinNumElements()) {
+ if (!(VT.isFixedLengthVector() || VecVT.isScalableVector()))
return SDValue();
- } else if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits()))
- return SDValue();
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Vec,
+ DAG.getVectorIdxConstant(0, DL));
+ }
MVT ContainerVT = VT;
if (VT.isFixedLengthVector())
>From 3c0b414a4134a5e415b7b045d3c82419d7cdbd54 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Fri, 10 Jan 2025 18:56:22 -0300
Subject: [PATCH 5/7] Update tests
Signed-off-by: Mikhail R. Gadelha <mikhail at igalia.com>
---
.../RISCV/rvv/intrinsic-vector-match.ll | 643 ++++++------------
llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll | 4 +-
2 files changed, 204 insertions(+), 443 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index 5d730da09ef83f..773cbf46dc8d27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -143,9 +143,8 @@ define <vscale x 16 x i1> @match_nxv16i8_v16i8(<vscale x 16 x i8> %op1, <16 x i8
define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask) {
; CHECK-LABEL: match_v16i8_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v10, v9, 0
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vrgather.vi v10, v9, 0
; CHECK-NEXT: vmseq.vv v8, v8, v10
; CHECK-NEXT: vmand.mm v0, v8, v0
; CHECK-NEXT: ret
@@ -383,69 +382,53 @@ define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) {
; CHECK-LABEL: match_v8i8_v16i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: vslidedown.vi v10, v9, 1
-; CHECK-NEXT: vslidedown.vi v11, v9, 2
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: vslidedown.vi v10, v9, 3
-; CHECK-NEXT: vmv.x.s a2, v11
-; CHECK-NEXT: vslidedown.vi v11, v9, 4
-; CHECK-NEXT: vmv.x.s a3, v10
-; CHECK-NEXT: vslidedown.vi v10, v9, 5
-; CHECK-NEXT: vmv.x.s a4, v11
-; CHECK-NEXT: vslidedown.vi v11, v9, 6
-; CHECK-NEXT: vmv.x.s a5, v10
-; CHECK-NEXT: vslidedown.vi v10, v9, 7
-; CHECK-NEXT: vmv.x.s a6, v11
-; CHECK-NEXT: vslidedown.vi v11, v9, 8
-; CHECK-NEXT: vmv.x.s a7, v10
-; CHECK-NEXT: vslidedown.vi v10, v9, 9
-; CHECK-NEXT: vmv.x.s t0, v11
-; CHECK-NEXT: vslidedown.vi v11, v9, 10
-; CHECK-NEXT: vmv.x.s t1, v10
-; CHECK-NEXT: vslidedown.vi v10, v9, 11
-; CHECK-NEXT: vmv.x.s t2, v11
-; CHECK-NEXT: vslidedown.vi v11, v9, 12
-; CHECK-NEXT: vmv.x.s t3, v10
-; CHECK-NEXT: vslidedown.vi v10, v9, 13
-; CHECK-NEXT: vmv.x.s t4, v11
-; CHECK-NEXT: vslidedown.vi v11, v9, 14
-; CHECK-NEXT: vslidedown.vi v9, v9, 15
-; CHECK-NEXT: vmv.x.s t5, v10
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmseq.vx v10, v8, a0
-; CHECK-NEXT: vmv.x.s a0, v11
-; CHECK-NEXT: vmseq.vx v11, v8, a1
-; CHECK-NEXT: vmv.x.s a1, v9
-; CHECK-NEXT: vmseq.vx v9, v8, a2
+; CHECK-NEXT: vrgather.vi v10, v9, 1
+; CHECK-NEXT: vrgather.vi v11, v9, 0
+; CHECK-NEXT: vmseq.vv v10, v8, v10
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v11, v10
+; CHECK-NEXT: vrgather.vi v11, v9, 2
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 3
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 4
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 5
+; CHECK-NEXT: vmseq.vv v11, v8, v11
; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vmseq.vx v11, v8, a3
+; CHECK-NEXT: vrgather.vi v11, v9, 6
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 7
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 8
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 9
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 10
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 11
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 12
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 13
+; CHECK-NEXT: vmseq.vv v11, v8, v11
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vrgather.vi v11, v9, 14
+; CHECK-NEXT: vrgather.vi v12, v9, 15
+; CHECK-NEXT: vmseq.vv v9, v8, v11
; CHECK-NEXT: vmor.mm v9, v10, v9
-; CHECK-NEXT: vmseq.vx v10, v8, a4
-; CHECK-NEXT: vmor.mm v9, v9, v11
-; CHECK-NEXT: vmseq.vx v11, v8, a5
-; CHECK-NEXT: vmor.mm v9, v9, v10
-; CHECK-NEXT: vmseq.vx v10, v8, a6
-; CHECK-NEXT: vmor.mm v9, v9, v11
-; CHECK-NEXT: vmseq.vx v11, v8, a7
-; CHECK-NEXT: vmor.mm v9, v9, v10
-; CHECK-NEXT: vmseq.vx v10, v8, t0
-; CHECK-NEXT: vmor.mm v9, v9, v11
-; CHECK-NEXT: vmseq.vx v11, v8, t1
-; CHECK-NEXT: vmor.mm v9, v9, v10
-; CHECK-NEXT: vmseq.vx v10, v8, t2
-; CHECK-NEXT: vmor.mm v9, v9, v11
-; CHECK-NEXT: vmseq.vx v11, v8, t3
-; CHECK-NEXT: vmor.mm v9, v9, v10
-; CHECK-NEXT: vmseq.vx v10, v8, t4
-; CHECK-NEXT: vmor.mm v9, v9, v11
-; CHECK-NEXT: vmseq.vx v11, v8, t5
-; CHECK-NEXT: vmor.mm v9, v9, v10
-; CHECK-NEXT: vmseq.vx v10, v8, a0
-; CHECK-NEXT: vmor.mm v9, v9, v11
-; CHECK-NEXT: vmor.mm v9, v9, v10
-; CHECK-NEXT: vmseq.vx v8, v8, a1
+; CHECK-NEXT: vmseq.vv v8, v8, v12
; CHECK-NEXT: vmor.mm v8, v9, v8
; CHECK-NEXT: vmand.mm v0, v8, v0
; CHECK-NEXT: ret
@@ -844,385 +827,142 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
}
define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
-; RV32-LABEL: match_v16i8_v32i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: .cfi_offset s1, -12
-; RV32-NEXT: .cfi_offset s2, -16
-; RV32-NEXT: .cfi_offset s3, -20
-; RV32-NEXT: .cfi_offset s4, -24
-; RV32-NEXT: .cfi_offset s5, -28
-; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: .cfi_offset s7, -36
-; RV32-NEXT: .cfi_offset s8, -40
-; RV32-NEXT: .cfi_offset s9, -44
-; RV32-NEXT: .cfi_offset s10, -48
-; RV32-NEXT: .cfi_offset s11, -52
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vslidedown.vi v9, v10, 1
-; RV32-NEXT: vslidedown.vi v12, v10, 2
-; RV32-NEXT: vslidedown.vi v13, v10, 3
-; RV32-NEXT: vslidedown.vi v14, v10, 4
-; RV32-NEXT: vslidedown.vi v15, v10, 5
-; RV32-NEXT: vslidedown.vi v16, v10, 6
-; RV32-NEXT: vslidedown.vi v17, v10, 7
-; RV32-NEXT: vslidedown.vi v18, v10, 8
-; RV32-NEXT: vslidedown.vi v19, v10, 9
-; RV32-NEXT: vslidedown.vi v20, v10, 10
-; RV32-NEXT: vslidedown.vi v21, v10, 11
-; RV32-NEXT: vslidedown.vi v22, v10, 12
-; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v24, v10, 16
-; RV32-NEXT: vmv.x.s a1, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 17
-; RV32-NEXT: vmv.x.s a2, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 18
-; RV32-NEXT: vmv.x.s a3, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 19
-; RV32-NEXT: vmv.x.s a4, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 20
-; RV32-NEXT: vmv.x.s a5, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 21
-; RV32-NEXT: vmv.x.s a6, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 22
-; RV32-NEXT: vmv.x.s a7, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 23
-; RV32-NEXT: vmv.x.s t0, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 24
-; RV32-NEXT: vmv.x.s t1, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 25
-; RV32-NEXT: vmv.x.s t2, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 26
-; RV32-NEXT: vmv.x.s t3, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 27
-; RV32-NEXT: vmv.x.s t4, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 28
-; RV32-NEXT: vmv.x.s t5, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 29
-; RV32-NEXT: vmv.x.s t6, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 30
-; RV32-NEXT: vmv.x.s s0, v24
-; RV32-NEXT: vslidedown.vi v24, v10, 31
-; RV32-NEXT: vmv.x.s s1, v24
-; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v11, v10, 13
-; RV32-NEXT: vslidedown.vi v23, v10, 14
-; RV32-NEXT: vslidedown.vi v10, v10, 15
-; RV32-NEXT: vmv.x.s s2, v9
-; RV32-NEXT: vmv.x.s s3, v12
-; RV32-NEXT: vmv.x.s s4, v13
-; RV32-NEXT: vmv.x.s s5, v14
-; RV32-NEXT: vmv.x.s s6, v15
-; RV32-NEXT: vmv.x.s s7, v16
-; RV32-NEXT: vmv.x.s s8, v17
-; RV32-NEXT: vmv.x.s s9, v18
-; RV32-NEXT: vmv.x.s s10, v19
-; RV32-NEXT: vmv.x.s s11, v20
-; RV32-NEXT: vmv.x.s ra, v21
-; RV32-NEXT: vmseq.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a0, v22
-; RV32-NEXT: vmseq.vx v12, v8, s2
-; RV32-NEXT: vmv.x.s s2, v11
-; RV32-NEXT: vmseq.vx v11, v8, s3
-; RV32-NEXT: vmv.x.s s3, v23
-; RV32-NEXT: vmseq.vx v13, v8, s4
-; RV32-NEXT: vmv.x.s s4, v10
-; RV32-NEXT: vmseq.vx v10, v8, s5
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, s6
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s7
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, s8
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, s9
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, s10
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s11
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, ra
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, a0
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, s2
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s3
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, s4
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, a1
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, a2
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, a3
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, a4
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, a5
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, a6
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, a7
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, t0
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, t1
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, t2
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, t3
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, t4
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, t5
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, t6
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s0
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v8, v8, s1
-; RV32-NEXT: vmor.mm v8, v9, v8
-; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: .cfi_restore s1
-; RV32-NEXT: .cfi_restore s2
-; RV32-NEXT: .cfi_restore s3
-; RV32-NEXT: .cfi_restore s4
-; RV32-NEXT: .cfi_restore s5
-; RV32-NEXT: .cfi_restore s6
-; RV32-NEXT: .cfi_restore s7
-; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: .cfi_restore s10
-; RV32-NEXT: .cfi_restore s11
-; RV32-NEXT: addi sp, sp, 64
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: match_v16i8_v32i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -112
-; RV64-NEXT: .cfi_def_cfa_offset 112
-; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
-; RV64-NEXT: .cfi_offset s2, -32
-; RV64-NEXT: .cfi_offset s3, -40
-; RV64-NEXT: .cfi_offset s4, -48
-; RV64-NEXT: .cfi_offset s5, -56
-; RV64-NEXT: .cfi_offset s6, -64
-; RV64-NEXT: .cfi_offset s7, -72
-; RV64-NEXT: .cfi_offset s8, -80
-; RV64-NEXT: .cfi_offset s9, -88
-; RV64-NEXT: .cfi_offset s10, -96
-; RV64-NEXT: .cfi_offset s11, -104
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vslidedown.vi v9, v10, 1
-; RV64-NEXT: vslidedown.vi v12, v10, 2
-; RV64-NEXT: vslidedown.vi v13, v10, 3
-; RV64-NEXT: vslidedown.vi v14, v10, 4
-; RV64-NEXT: vslidedown.vi v15, v10, 5
-; RV64-NEXT: vslidedown.vi v16, v10, 6
-; RV64-NEXT: vslidedown.vi v17, v10, 7
-; RV64-NEXT: vslidedown.vi v18, v10, 8
-; RV64-NEXT: vslidedown.vi v19, v10, 9
-; RV64-NEXT: vslidedown.vi v20, v10, 10
-; RV64-NEXT: vslidedown.vi v21, v10, 11
-; RV64-NEXT: vslidedown.vi v22, v10, 12
-; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v24, v10, 16
-; RV64-NEXT: vmv.x.s a1, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 17
-; RV64-NEXT: vmv.x.s a2, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 18
-; RV64-NEXT: vmv.x.s a3, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 19
-; RV64-NEXT: vmv.x.s a4, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 20
-; RV64-NEXT: vmv.x.s a5, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 21
-; RV64-NEXT: vmv.x.s a6, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 22
-; RV64-NEXT: vmv.x.s a7, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 23
-; RV64-NEXT: vmv.x.s t0, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 24
-; RV64-NEXT: vmv.x.s t1, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 25
-; RV64-NEXT: vmv.x.s t2, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 26
-; RV64-NEXT: vmv.x.s t3, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 27
-; RV64-NEXT: vmv.x.s t4, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 28
-; RV64-NEXT: vmv.x.s t5, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 29
-; RV64-NEXT: vmv.x.s t6, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 30
-; RV64-NEXT: vmv.x.s s0, v24
-; RV64-NEXT: vslidedown.vi v24, v10, 31
-; RV64-NEXT: vmv.x.s s1, v24
-; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v11, v10, 13
-; RV64-NEXT: vslidedown.vi v23, v10, 14
-; RV64-NEXT: vslidedown.vi v10, v10, 15
-; RV64-NEXT: vmv.x.s s2, v9
-; RV64-NEXT: vmv.x.s s3, v12
-; RV64-NEXT: vmv.x.s s4, v13
-; RV64-NEXT: vmv.x.s s5, v14
-; RV64-NEXT: vmv.x.s s6, v15
-; RV64-NEXT: vmv.x.s s7, v16
-; RV64-NEXT: vmv.x.s s8, v17
-; RV64-NEXT: vmv.x.s s9, v18
-; RV64-NEXT: vmv.x.s s10, v19
-; RV64-NEXT: vmv.x.s s11, v20
-; RV64-NEXT: vmv.x.s ra, v21
-; RV64-NEXT: vmseq.vx v9, v8, a0
-; RV64-NEXT: vmv.x.s a0, v22
-; RV64-NEXT: vmseq.vx v12, v8, s2
-; RV64-NEXT: vmv.x.s s2, v11
-; RV64-NEXT: vmseq.vx v11, v8, s3
-; RV64-NEXT: vmv.x.s s3, v23
-; RV64-NEXT: vmseq.vx v13, v8, s4
-; RV64-NEXT: vmv.x.s s4, v10
-; RV64-NEXT: vmseq.vx v10, v8, s5
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, s6
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s7
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, s8
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, s9
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, s10
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s11
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, ra
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, a0
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, s2
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s3
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, s4
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, a1
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, a2
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, a3
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, a4
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, a5
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, a6
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, a7
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, t0
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, t1
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, t2
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, t3
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, t4
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, t5
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, t6
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s0
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v8, v8, s1
-; RV64-NEXT: vmor.mm v8, v9, v8
-; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: .cfi_restore s1
-; RV64-NEXT: .cfi_restore s2
-; RV64-NEXT: .cfi_restore s3
-; RV64-NEXT: .cfi_restore s4
-; RV64-NEXT: .cfi_restore s5
-; RV64-NEXT: .cfi_restore s6
-; RV64-NEXT: .cfi_restore s7
-; RV64-NEXT: .cfi_restore s8
-; RV64-NEXT: .cfi_restore s9
-; RV64-NEXT: .cfi_restore s10
-; RV64-NEXT: .cfi_restore s11
-; RV64-NEXT: addi sp, sp, 112
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: match_v16i8_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vrgather.vi v0, v10, 1
+; CHECK-NEXT: vrgather.vi v9, v10, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs1r.v v9, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vi v9, v10, 2
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs1r.v v9, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vrgather.vi v13, v10, 3
+; CHECK-NEXT: vrgather.vi v14, v10, 4
+; CHECK-NEXT: vrgather.vi v15, v10, 5
+; CHECK-NEXT: vrgather.vi v16, v10, 6
+; CHECK-NEXT: vrgather.vi v17, v10, 7
+; CHECK-NEXT: vrgather.vi v18, v10, 8
+; CHECK-NEXT: vrgather.vi v19, v10, 9
+; CHECK-NEXT: vrgather.vi v20, v10, 10
+; CHECK-NEXT: vrgather.vi v21, v10, 11
+; CHECK-NEXT: vrgather.vi v22, v10, 12
+; CHECK-NEXT: vrgather.vi v23, v10, 13
+; CHECK-NEXT: vrgather.vi v24, v10, 14
+; CHECK-NEXT: vrgather.vi v25, v10, 15
+; CHECK-NEXT: vrgather.vi v26, v10, 16
+; CHECK-NEXT: vrgather.vi v27, v10, 17
+; CHECK-NEXT: vrgather.vi v28, v10, 18
+; CHECK-NEXT: vrgather.vi v29, v10, 19
+; CHECK-NEXT: vrgather.vi v30, v10, 20
+; CHECK-NEXT: vrgather.vi v31, v10, 21
+; CHECK-NEXT: vrgather.vi v7, v10, 22
+; CHECK-NEXT: vrgather.vi v6, v10, 23
+; CHECK-NEXT: vrgather.vi v5, v10, 24
+; CHECK-NEXT: vrgather.vi v4, v10, 25
+; CHECK-NEXT: vrgather.vi v3, v10, 26
+; CHECK-NEXT: vrgather.vi v2, v10, 27
+; CHECK-NEXT: vrgather.vi v1, v10, 28
+; CHECK-NEXT: vrgather.vi v12, v10, 29
+; CHECK-NEXT: vrgather.vi v9, v10, 30
+; CHECK-NEXT: vrgather.vi v11, v10, 31
+; CHECK-NEXT: vmseq.vv v10, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmseq.vv v0, v8, v0
+; CHECK-NEXT: vmor.mm v10, v0, v10
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmseq.vv v0, v8, v0
+; CHECK-NEXT: vmor.mm v10, v10, v0
+; CHECK-NEXT: vmseq.vv v13, v8, v13
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v14
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v15
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v16
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v17
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v18
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v19
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v20
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v21
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v22
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v23
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v24
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v25
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v26
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v27
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v28
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v29
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v30
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v31
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v7
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v6
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v5
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v4
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v3
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v2
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v13, v8, v1
+; CHECK-NEXT: vmor.mm v10, v10, v13
+; CHECK-NEXT: vmseq.vv v12, v8, v12
+; CHECK-NEXT: vmseq.vv v9, v8, v9
+; CHECK-NEXT: vmor.mm v10, v10, v12
+; CHECK-NEXT: vmor.mm v9, v10, v9
+; CHECK-NEXT: vmseq.vv v8, v8, v11
+; CHECK-NEXT: vmor.mm v8, v9, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmand.mm v0, v8, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
@@ -1298,3 +1038,24 @@ define <2 x i1> @match_v2xi64_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %ma
%r = tail call <2 x i1> @llvm.experimental.vector.match(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask)
ret <2 x i1> %r
}
+
+define <2 x i1> @match_v2xi64_v4i64(<2 x i64> %op1, <4 x i64> %op2, <2 x i1> %mask) {
+; CHECK-LABEL: match_v2xi64_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vrgather.vi v9, v10, 1
+; CHECK-NEXT: vrgather.vi v11, v10, 0
+; CHECK-NEXT: vrgather.vi v12, v10, 2
+; CHECK-NEXT: vrgather.vi v13, v10, 3
+; CHECK-NEXT: vmseq.vv v9, v8, v9
+; CHECK-NEXT: vmseq.vv v10, v8, v11
+; CHECK-NEXT: vmseq.vv v11, v8, v12
+; CHECK-NEXT: vmor.mm v9, v10, v9
+; CHECK-NEXT: vmor.mm v9, v9, v11
+; CHECK-NEXT: vmseq.vv v8, v8, v13
+; CHECK-NEXT: vmor.mm v8, v9, v8
+; CHECK-NEXT: vmand.mm v0, v8, v0
+; CHECK-NEXT: ret
+ %r = tail call <2 x i1> @llvm.experimental.vector.match(<2 x i64> %op1, <4 x i64> %op2, <2 x i1> %mask)
+ ret <2 x i1> %r
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
index eb57b973eacfee..93d02c8615da0b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll
@@ -123,8 +123,8 @@ define <vscale x 4 x i8> @splat_idx_constant_nxv4i8(<vscale x 8 x i8> %v) {
; CHECK-LABEL: splat_idx_constant_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%x = extractelement <vscale x 8 x i8> %v, i64 0
%ins = insertelement <vscale x 4 x i8> poison, i8 %x, i32 0
>From 7440e761e05af69043ff5a58e681affce6cf69bb Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Tue, 14 Jan 2025 12:20:05 -0300
Subject: [PATCH 6/7] Extract the subvector from the splat vector if it's
larger than the src vector
Signed-off-by: Mikhail R. Gadelha <mikhail at igalia.com>
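For illustration (a sketch only; it has the same shape as the existing
match_nxv16i8_v32i8 test, and the function name below is hypothetical):
with this change the fixed-length <32 x i8> needle vector is first
converted to a scalable container and then placed into a container of
the result type (INSERT_SUBVECTOR when it is narrower, EXTRACT_SUBVECTOR
when it is wider), so needles at constant in-bounds indices can be
splatted with vrgather.vi instead of being moved through scalar
registers, while out-of-bounds indices keep the vslidedown/vmv.x.s path:

  define <vscale x 16 x i1> @match_scalable_haystack_wide_needles(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
    %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
    ret <vscale x 16 x i1> %r
  }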
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 33 +-
.../RISCV/rvv/intrinsic-vector-match.ll | 1062 +++++++++--------
2 files changed, 572 insertions(+), 523 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 72be770449acc2..1cbec35c5214dc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3525,23 +3525,36 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
if (Idx.getValueType() != Subtarget.getXLenVT())
return SDValue();
- // If the search vector is smaller than the vector of elements we are searching for,
- // try to extract the subvector from it
- if (VT.getVectorMinNumElements() < VecVT.getVectorMinNumElements()) {
- if (!(VT.isFixedLengthVector() || VecVT.isScalableVector()))
- return SDValue();
- Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Vec,
- DAG.getVectorIdxConstant(0, DL));
+ // Check that we know Idx lies within VT
+ if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ if (CIdx->getZExtValue() >= VT.getVectorElementCount().getKnownMinValue())
+ return SDValue();
}
+ else if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits()))
+ return SDValue();
+ // Convert fixed length vectors to scalable
MVT ContainerVT = VT;
if (VT.isFixedLengthVector())
ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
- DAG.getUNDEF(ContainerVT), Vec,
- DAG.getVectorIdxConstant(0, DL));
+ MVT ContainerVecVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVecVT = getContainerForFixedLengthVector(DAG, VecVT, Subtarget);
+ Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget);
+ }
+
+ // Put Vec in a VT sized vector
+ if (ContainerVecVT.getVectorMinNumElements() <
+ ContainerVT.getVectorMinNumElements())
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), Vec,
+ DAG.getVectorIdxConstant(0, DL));
+ else
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+ DAG.getVectorIdxConstant(0, DL));
+ // We checked that Idx fits inside VT earlier
auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, Vec,
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index 773cbf46dc8d27..e3a46840b8b275 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -384,6 +384,24 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v9, 1
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v11, v9, 8
+; CHECK-NEXT: vmv.x.s a0, v11
+; CHECK-NEXT: vslidedown.vi v11, v9, 9
+; CHECK-NEXT: vmv.x.s a1, v11
+; CHECK-NEXT: vslidedown.vi v11, v9, 10
+; CHECK-NEXT: vmv.x.s a2, v11
+; CHECK-NEXT: vslidedown.vi v11, v9, 11
+; CHECK-NEXT: vmv.x.s a3, v11
+; CHECK-NEXT: vslidedown.vi v11, v9, 12
+; CHECK-NEXT: vmv.x.s a4, v11
+; CHECK-NEXT: vslidedown.vi v11, v9, 13
+; CHECK-NEXT: vmv.x.s a5, v11
+; CHECK-NEXT: vslidedown.vi v11, v9, 14
+; CHECK-NEXT: vmv.x.s a6, v11
+; CHECK-NEXT: vslidedown.vi v11, v9, 15
+; CHECK-NEXT: vmv.x.s a7, v11
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v11, v9, 0
; CHECK-NEXT: vmseq.vv v10, v8, v10
; CHECK-NEXT: vmseq.vv v11, v8, v11
@@ -403,32 +421,24 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
; CHECK-NEXT: vrgather.vi v11, v9, 6
; CHECK-NEXT: vmseq.vv v11, v8, v11
; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 7
-; CHECK-NEXT: vmseq.vv v11, v8, v11
-; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 8
-; CHECK-NEXT: vmseq.vv v11, v8, v11
-; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 9
-; CHECK-NEXT: vmseq.vv v11, v8, v11
-; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 10
-; CHECK-NEXT: vmseq.vv v11, v8, v11
-; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 11
-; CHECK-NEXT: vmseq.vv v11, v8, v11
-; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 12
-; CHECK-NEXT: vmseq.vv v11, v8, v11
-; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 13
-; CHECK-NEXT: vmseq.vv v11, v8, v11
-; CHECK-NEXT: vmor.mm v10, v10, v11
-; CHECK-NEXT: vrgather.vi v11, v9, 14
-; CHECK-NEXT: vrgather.vi v12, v9, 15
-; CHECK-NEXT: vmseq.vv v9, v8, v11
+; CHECK-NEXT: vmseq.vx v11, v8, a0
+; CHECK-NEXT: vrgather.vi v12, v9, 7
+; CHECK-NEXT: vmseq.vv v9, v8, v12
; CHECK-NEXT: vmor.mm v9, v10, v9
-; CHECK-NEXT: vmseq.vv v8, v8, v12
+; CHECK-NEXT: vmseq.vx v10, v8, a1
+; CHECK-NEXT: vmor.mm v9, v9, v11
+; CHECK-NEXT: vmseq.vx v11, v8, a2
+; CHECK-NEXT: vmor.mm v9, v9, v10
+; CHECK-NEXT: vmseq.vx v10, v8, a3
+; CHECK-NEXT: vmor.mm v9, v9, v11
+; CHECK-NEXT: vmseq.vx v11, v8, a4
+; CHECK-NEXT: vmor.mm v9, v9, v10
+; CHECK-NEXT: vmseq.vx v10, v8, a5
+; CHECK-NEXT: vmor.mm v9, v9, v11
+; CHECK-NEXT: vmseq.vx v11, v8, a6
+; CHECK-NEXT: vmor.mm v9, v9, v10
+; CHECK-NEXT: vmor.mm v9, v9, v11
+; CHECK-NEXT: vmseq.vx v8, v8, a7
; CHECK-NEXT: vmor.mm v8, v9, v8
; CHECK-NEXT: vmand.mm v0, v8, v0
; CHECK-NEXT: ret
@@ -439,530 +449,511 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
; RV32-LABEL: match_nxv16i8_v32i8:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: .cfi_offset s1, -12
-; RV32-NEXT: .cfi_offset s2, -16
-; RV32-NEXT: .cfi_offset s3, -20
-; RV32-NEXT: .cfi_offset s4, -24
-; RV32-NEXT: .cfi_offset s5, -28
-; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: .cfi_offset s7, -36
-; RV32-NEXT: .cfi_offset s8, -40
-; RV32-NEXT: .cfi_offset s9, -44
-; RV32-NEXT: .cfi_offset s10, -48
-; RV32-NEXT: .cfi_offset s11, -52
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: vslidedown.vi v12, v10, 1
-; RV32-NEXT: vslidedown.vi v13, v10, 2
-; RV32-NEXT: vslidedown.vi v14, v10, 3
-; RV32-NEXT: vslidedown.vi v15, v10, 4
-; RV32-NEXT: vslidedown.vi v16, v10, 5
-; RV32-NEXT: vslidedown.vi v17, v10, 6
-; RV32-NEXT: vslidedown.vi v18, v10, 7
-; RV32-NEXT: vslidedown.vi v19, v10, 8
-; RV32-NEXT: vslidedown.vi v20, v10, 9
-; RV32-NEXT: vslidedown.vi v21, v10, 10
-; RV32-NEXT: vslidedown.vi v22, v10, 11
-; RV32-NEXT: vslidedown.vi v23, v10, 12
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; RV32-NEXT: vrgather.vi v14, v10, 1
+; RV32-NEXT: vrgather.vi v16, v10, 0
+; RV32-NEXT: vrgather.vi v18, v10, 2
+; RV32-NEXT: vrgather.vi v20, v10, 3
+; RV32-NEXT: vrgather.vi v22, v10, 4
+; RV32-NEXT: vrgather.vi v24, v10, 5
+; RV32-NEXT: vrgather.vi v26, v10, 6
+; RV32-NEXT: vrgather.vi v28, v10, 7
+; RV32-NEXT: vmseq.vv v12, v8, v14
+; RV32-NEXT: vmseq.vv v13, v8, v16
+; RV32-NEXT: vrgather.vi v30, v10, 8
+; RV32-NEXT: vmseq.vv v14, v8, v18
+; RV32-NEXT: vmseq.vv v15, v8, v20
+; RV32-NEXT: vrgather.vi v6, v10, 9
+; RV32-NEXT: vmseq.vv v16, v8, v22
+; RV32-NEXT: vmseq.vv v17, v8, v24
+; RV32-NEXT: vrgather.vi v24, v10, 10
+; RV32-NEXT: vmseq.vv v18, v8, v26
+; RV32-NEXT: vmseq.vv v19, v8, v28
+; RV32-NEXT: vrgather.vi v26, v10, 11
+; RV32-NEXT: vmseq.vv v20, v8, v30
+; RV32-NEXT: vmseq.vv v21, v8, v6
+; RV32-NEXT: vrgather.vi v28, v10, 12
+; RV32-NEXT: vmseq.vv v22, v8, v24
+; RV32-NEXT: vmseq.vv v23, v8, v26
+; RV32-NEXT: vrgather.vi v26, v10, 13
+; RV32-NEXT: vmseq.vv v25, v8, v28
+; RV32-NEXT: vmseq.vv v24, v8, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 16
+; RV32-NEXT: vmv.x.s a0, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 17
+; RV32-NEXT: vmv.x.s a1, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 18
+; RV32-NEXT: vmv.x.s a2, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 19
+; RV32-NEXT: vmv.x.s a3, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 20
+; RV32-NEXT: vmv.x.s a4, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 21
+; RV32-NEXT: vmv.x.s a5, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 22
+; RV32-NEXT: vmv.x.s a6, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 23
+; RV32-NEXT: vmv.x.s a7, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 24
+; RV32-NEXT: vmv.x.s t0, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 25
+; RV32-NEXT: vmv.x.s t1, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 26
+; RV32-NEXT: vmv.x.s t2, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 27
+; RV32-NEXT: vmv.x.s t3, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 28
+; RV32-NEXT: vmv.x.s t4, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 29
+; RV32-NEXT: vmv.x.s t5, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 30
+; RV32-NEXT: vmv.x.s t6, v26
+; RV32-NEXT: vslidedown.vi v26, v10, 31
+; RV32-NEXT: vmv.x.s s0, v26
+; RV32-NEXT: vrgather.vi v26, v10, 14
+; RV32-NEXT: vmseq.vv v28, v8, v26
+; RV32-NEXT: vrgather.vi v26, v10, 15
+; RV32-NEXT: vmseq.vv v10, v8, v26
+; RV32-NEXT: vmor.mm v11, v13, v12
+; RV32-NEXT: vmor.mm v11, v11, v14
+; RV32-NEXT: vmor.mm v11, v11, v15
+; RV32-NEXT: vmor.mm v11, v11, v16
+; RV32-NEXT: vmor.mm v11, v11, v17
+; RV32-NEXT: vmor.mm v11, v11, v18
+; RV32-NEXT: vmor.mm v11, v11, v19
+; RV32-NEXT: vmor.mm v11, v11, v20
+; RV32-NEXT: vmor.mm v11, v11, v21
+; RV32-NEXT: vmor.mm v11, v11, v22
+; RV32-NEXT: vmor.mm v11, v11, v23
+; RV32-NEXT: vmor.mm v11, v11, v25
+; RV32-NEXT: vmseq.vx v12, v8, a0
+; RV32-NEXT: vmor.mm v11, v11, v24
+; RV32-NEXT: vmseq.vx v13, v8, a1
+; RV32-NEXT: vmor.mm v11, v11, v28
+; RV32-NEXT: vmseq.vx v14, v8, a2
+; RV32-NEXT: vmor.mm v10, v11, v10
+; RV32-NEXT: vmseq.vx v11, v8, a3
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, a4
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, a5
+; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v14, v8, a6
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vx v11, v8, a7
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, t0
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, t1
+; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v14, v8, t2
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vx v11, v8, t3
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, t4
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, t5
+; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v14, v8, t6
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v11, v8, s0
+; RV32-NEXT: vmor.mm v8, v10, v11
+; RV32-NEXT: vmand.mm v0, v8, v0
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: match_nxv16i8_v32i8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; RV64-NEXT: vrgather.vi v14, v10, 1
+; RV64-NEXT: vrgather.vi v16, v10, 0
+; RV64-NEXT: vrgather.vi v18, v10, 2
+; RV64-NEXT: vrgather.vi v20, v10, 3
+; RV64-NEXT: vrgather.vi v22, v10, 4
+; RV64-NEXT: vrgather.vi v24, v10, 5
+; RV64-NEXT: vrgather.vi v26, v10, 6
+; RV64-NEXT: vrgather.vi v28, v10, 7
+; RV64-NEXT: vmseq.vv v12, v8, v14
+; RV64-NEXT: vmseq.vv v13, v8, v16
+; RV64-NEXT: vrgather.vi v30, v10, 8
+; RV64-NEXT: vmseq.vv v14, v8, v18
+; RV64-NEXT: vmseq.vv v15, v8, v20
+; RV64-NEXT: vrgather.vi v6, v10, 9
+; RV64-NEXT: vmseq.vv v16, v8, v22
+; RV64-NEXT: vmseq.vv v17, v8, v24
+; RV64-NEXT: vrgather.vi v24, v10, 10
+; RV64-NEXT: vmseq.vv v18, v8, v26
+; RV64-NEXT: vmseq.vv v19, v8, v28
+; RV64-NEXT: vrgather.vi v26, v10, 11
+; RV64-NEXT: vmseq.vv v20, v8, v30
+; RV64-NEXT: vmseq.vv v21, v8, v6
+; RV64-NEXT: vrgather.vi v28, v10, 12
+; RV64-NEXT: vmseq.vv v22, v8, v24
+; RV64-NEXT: vmseq.vv v23, v8, v26
+; RV64-NEXT: vrgather.vi v26, v10, 13
+; RV64-NEXT: vmseq.vv v25, v8, v28
+; RV64-NEXT: vmseq.vv v24, v8, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 16
+; RV64-NEXT: vmv.x.s a0, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 17
+; RV64-NEXT: vmv.x.s a1, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 18
+; RV64-NEXT: vmv.x.s a2, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 19
+; RV64-NEXT: vmv.x.s a3, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 20
+; RV64-NEXT: vmv.x.s a4, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 21
+; RV64-NEXT: vmv.x.s a5, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 22
+; RV64-NEXT: vmv.x.s a6, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 23
+; RV64-NEXT: vmv.x.s a7, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 24
+; RV64-NEXT: vmv.x.s t0, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 25
+; RV64-NEXT: vmv.x.s t1, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 26
+; RV64-NEXT: vmv.x.s t2, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 27
+; RV64-NEXT: vmv.x.s t3, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 28
+; RV64-NEXT: vmv.x.s t4, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 29
+; RV64-NEXT: vmv.x.s t5, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 30
+; RV64-NEXT: vmv.x.s t6, v26
+; RV64-NEXT: vslidedown.vi v26, v10, 31
+; RV64-NEXT: vmv.x.s s0, v26
+; RV64-NEXT: vrgather.vi v26, v10, 14
+; RV64-NEXT: vmseq.vv v28, v8, v26
+; RV64-NEXT: vrgather.vi v26, v10, 15
+; RV64-NEXT: vmseq.vv v10, v8, v26
+; RV64-NEXT: vmor.mm v11, v13, v12
+; RV64-NEXT: vmor.mm v11, v11, v14
+; RV64-NEXT: vmor.mm v11, v11, v15
+; RV64-NEXT: vmor.mm v11, v11, v16
+; RV64-NEXT: vmor.mm v11, v11, v17
+; RV64-NEXT: vmor.mm v11, v11, v18
+; RV64-NEXT: vmor.mm v11, v11, v19
+; RV64-NEXT: vmor.mm v11, v11, v20
+; RV64-NEXT: vmor.mm v11, v11, v21
+; RV64-NEXT: vmor.mm v11, v11, v22
+; RV64-NEXT: vmor.mm v11, v11, v23
+; RV64-NEXT: vmor.mm v11, v11, v25
+; RV64-NEXT: vmseq.vx v12, v8, a0
+; RV64-NEXT: vmor.mm v11, v11, v24
+; RV64-NEXT: vmseq.vx v13, v8, a1
+; RV64-NEXT: vmor.mm v11, v11, v28
+; RV64-NEXT: vmseq.vx v14, v8, a2
+; RV64-NEXT: vmor.mm v10, v11, v10
+; RV64-NEXT: vmseq.vx v11, v8, a3
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, a4
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, a5
+; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v14, v8, a6
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vx v11, v8, a7
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, t0
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, t1
+; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v14, v8, t2
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vx v11, v8, t3
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, t4
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, t5
+; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v14, v8, t6
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v11, v8, s0
+; RV64-NEXT: vmor.mm v8, v10, v11
+; RV64-NEXT: vmand.mm v0, v8, v0
+; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
+ ret <vscale x 16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
+; RV32-LABEL: match_v16i8_v32i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vrgather.vi v9, v10, 1
+; RV32-NEXT: vrgather.vi v12, v10, 0
+; RV32-NEXT: vrgather.vi v13, v10, 2
+; RV32-NEXT: vrgather.vi v14, v10, 3
+; RV32-NEXT: vrgather.vi v15, v10, 4
+; RV32-NEXT: vrgather.vi v16, v10, 5
+; RV32-NEXT: vrgather.vi v17, v10, 6
+; RV32-NEXT: vrgather.vi v18, v10, 7
+; RV32-NEXT: vrgather.vi v19, v10, 8
+; RV32-NEXT: vrgather.vi v20, v10, 9
+; RV32-NEXT: vrgather.vi v21, v10, 10
+; RV32-NEXT: vrgather.vi v22, v10, 11
+; RV32-NEXT: vrgather.vi v23, v10, 12
; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV32-NEXT: vslidedown.vi v24, v10, 16
-; RV32-NEXT: vmv.x.s a1, v24
+; RV32-NEXT: vmv.x.s a0, v24
; RV32-NEXT: vslidedown.vi v24, v10, 17
-; RV32-NEXT: vmv.x.s a2, v24
+; RV32-NEXT: vmv.x.s a1, v24
; RV32-NEXT: vslidedown.vi v24, v10, 18
-; RV32-NEXT: vmv.x.s a3, v24
+; RV32-NEXT: vmv.x.s a2, v24
; RV32-NEXT: vslidedown.vi v24, v10, 19
-; RV32-NEXT: vmv.x.s a4, v24
+; RV32-NEXT: vmv.x.s a3, v24
; RV32-NEXT: vslidedown.vi v24, v10, 20
-; RV32-NEXT: vmv.x.s a5, v24
+; RV32-NEXT: vmv.x.s a4, v24
; RV32-NEXT: vslidedown.vi v24, v10, 21
-; RV32-NEXT: vmv.x.s a6, v24
+; RV32-NEXT: vmv.x.s a5, v24
; RV32-NEXT: vslidedown.vi v24, v10, 22
-; RV32-NEXT: vmv.x.s a7, v24
+; RV32-NEXT: vmv.x.s a6, v24
; RV32-NEXT: vslidedown.vi v24, v10, 23
-; RV32-NEXT: vmv.x.s t0, v24
+; RV32-NEXT: vmv.x.s a7, v24
; RV32-NEXT: vslidedown.vi v24, v10, 24
-; RV32-NEXT: vmv.x.s t1, v24
+; RV32-NEXT: vmv.x.s t0, v24
; RV32-NEXT: vslidedown.vi v24, v10, 25
-; RV32-NEXT: vmv.x.s t2, v24
+; RV32-NEXT: vmv.x.s t1, v24
; RV32-NEXT: vslidedown.vi v24, v10, 26
-; RV32-NEXT: vmv.x.s t3, v24
+; RV32-NEXT: vmv.x.s t2, v24
; RV32-NEXT: vslidedown.vi v24, v10, 27
-; RV32-NEXT: vmv.x.s t4, v24
+; RV32-NEXT: vmv.x.s t3, v24
; RV32-NEXT: vslidedown.vi v24, v10, 28
-; RV32-NEXT: vmv.x.s t5, v24
+; RV32-NEXT: vmv.x.s t4, v24
; RV32-NEXT: vslidedown.vi v24, v10, 29
-; RV32-NEXT: vmv.x.s t6, v24
+; RV32-NEXT: vmv.x.s t5, v24
; RV32-NEXT: vslidedown.vi v24, v10, 30
-; RV32-NEXT: vmv.x.s s0, v24
+; RV32-NEXT: vmv.x.s t6, v24
; RV32-NEXT: vslidedown.vi v24, v10, 31
-; RV32-NEXT: vmv.x.s s1, v24
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v11, v10, 13
-; RV32-NEXT: vslidedown.vi v24, v10, 14
-; RV32-NEXT: vslidedown.vi v10, v10, 15
-; RV32-NEXT: vmv.x.s s2, v12
-; RV32-NEXT: vmv.x.s s3, v13
-; RV32-NEXT: vmv.x.s s4, v14
-; RV32-NEXT: vmv.x.s s5, v15
-; RV32-NEXT: vmv.x.s s6, v16
-; RV32-NEXT: vmv.x.s s7, v17
-; RV32-NEXT: vmv.x.s s8, v18
-; RV32-NEXT: vmv.x.s s9, v19
-; RV32-NEXT: vmv.x.s s10, v20
-; RV32-NEXT: vmv.x.s s11, v21
-; RV32-NEXT: vmv.x.s ra, v22
-; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; RV32-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: vmseq.vx v12, v8, a0
-; RV32-NEXT: vmv.x.s a0, v23
-; RV32-NEXT: vmseq.vx v13, v8, s2
-; RV32-NEXT: vmv.x.s s2, v11
-; RV32-NEXT: vmseq.vx v11, v8, s3
-; RV32-NEXT: vmv.x.s s3, v24
-; RV32-NEXT: vmseq.vx v14, v8, s4
-; RV32-NEXT: vmv.x.s s4, v10
-; RV32-NEXT: vmseq.vx v10, v8, s5
-; RV32-NEXT: vmor.mm v12, v12, v13
-; RV32-NEXT: vmseq.vx v13, v8, s6
-; RV32-NEXT: vmor.mm v11, v12, v11
-; RV32-NEXT: vmseq.vx v12, v8, s7
-; RV32-NEXT: vmor.mm v11, v11, v14
-; RV32-NEXT: vmseq.vx v14, v8, s8
-; RV32-NEXT: vmor.mm v10, v11, v10
-; RV32-NEXT: vmseq.vx v11, v8, s9
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, s10
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s11
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, ra
-; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, a0
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, s2
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s3
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, s4
-; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmv.x.s s0, v24
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vrgather.vi v11, v10, 13
+; RV32-NEXT: vrgather.vi v24, v10, 14
+; RV32-NEXT: vrgather.vi v25, v10, 15
+; RV32-NEXT: vmseq.vv v9, v8, v9
+; RV32-NEXT: vmseq.vv v10, v8, v12
+; RV32-NEXT: vmor.mm v9, v10, v9
+; RV32-NEXT: vmseq.vv v10, v8, v13
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v14
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v15
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v16
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v17
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v18
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v19
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v20
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v21
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v22
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v10, v8, v23
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vx v10, v8, a0
+; RV32-NEXT: vmseq.vv v11, v8, v11
+; RV32-NEXT: vmor.mm v9, v9, v11
; RV32-NEXT: vmseq.vx v11, v8, a1
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, a2
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, a3
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, a4
-; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vv v12, v8, v24
+; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v12, v8, a2
+; RV32-NEXT: vmseq.vv v13, v8, v25
+; RV32-NEXT: vmor.mm v9, v9, v13
+; RV32-NEXT: vmseq.vx v13, v8, a3
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vx v10, v8, a4
+; RV32-NEXT: vmor.mm v9, v9, v11
; RV32-NEXT: vmseq.vx v11, v8, a5
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, a6
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, a7
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, t0
-; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v12, v8, a6
+; RV32-NEXT: vmor.mm v9, v9, v13
+; RV32-NEXT: vmseq.vx v13, v8, a7
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vx v10, v8, t0
+; RV32-NEXT: vmor.mm v9, v9, v11
; RV32-NEXT: vmseq.vx v11, v8, t1
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, t2
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, t3
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, t4
-; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v12, v8, t2
+; RV32-NEXT: vmor.mm v9, v9, v13
+; RV32-NEXT: vmseq.vx v13, v8, t3
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vx v10, v8, t4
+; RV32-NEXT: vmor.mm v9, v9, v11
; RV32-NEXT: vmseq.vx v11, v8, t5
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, t6
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s0
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v11, v8, s1
-; RV32-NEXT: vmor.mm v8, v10, v11
+; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v12, v8, t6
+; RV32-NEXT: vmor.mm v9, v9, v13
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmor.mm v9, v9, v11
+; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v8, v8, s0
+; RV32-NEXT: vmor.mm v8, v9, v8
; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: .cfi_restore s1
-; RV32-NEXT: .cfi_restore s2
-; RV32-NEXT: .cfi_restore s3
-; RV32-NEXT: .cfi_restore s4
-; RV32-NEXT: .cfi_restore s5
-; RV32-NEXT: .cfi_restore s6
-; RV32-NEXT: .cfi_restore s7
-; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: .cfi_restore s10
-; RV32-NEXT: .cfi_restore s11
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
-; RV64-LABEL: match_nxv16i8_v32i8:
+; RV64-LABEL: match_v16i8_v32i8:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -112
-; RV64-NEXT: .cfi_def_cfa_offset 112
-; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
-; RV64-NEXT: .cfi_offset s2, -32
-; RV64-NEXT: .cfi_offset s3, -40
-; RV64-NEXT: .cfi_offset s4, -48
-; RV64-NEXT: .cfi_offset s5, -56
-; RV64-NEXT: .cfi_offset s6, -64
-; RV64-NEXT: .cfi_offset s7, -72
-; RV64-NEXT: .cfi_offset s8, -80
-; RV64-NEXT: .cfi_offset s9, -88
-; RV64-NEXT: .cfi_offset s10, -96
-; RV64-NEXT: .cfi_offset s11, -104
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: sd a0, 0(sp) # 8-byte Folded Spill
-; RV64-NEXT: vslidedown.vi v12, v10, 1
-; RV64-NEXT: vslidedown.vi v13, v10, 2
-; RV64-NEXT: vslidedown.vi v14, v10, 3
-; RV64-NEXT: vslidedown.vi v15, v10, 4
-; RV64-NEXT: vslidedown.vi v16, v10, 5
-; RV64-NEXT: vslidedown.vi v17, v10, 6
-; RV64-NEXT: vslidedown.vi v18, v10, 7
-; RV64-NEXT: vslidedown.vi v19, v10, 8
-; RV64-NEXT: vslidedown.vi v20, v10, 9
-; RV64-NEXT: vslidedown.vi v21, v10, 10
-; RV64-NEXT: vslidedown.vi v22, v10, 11
-; RV64-NEXT: vslidedown.vi v23, v10, 12
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vrgather.vi v9, v10, 1
+; RV64-NEXT: vrgather.vi v12, v10, 0
+; RV64-NEXT: vrgather.vi v13, v10, 2
+; RV64-NEXT: vrgather.vi v14, v10, 3
+; RV64-NEXT: vrgather.vi v15, v10, 4
+; RV64-NEXT: vrgather.vi v16, v10, 5
+; RV64-NEXT: vrgather.vi v17, v10, 6
+; RV64-NEXT: vrgather.vi v18, v10, 7
+; RV64-NEXT: vrgather.vi v19, v10, 8
+; RV64-NEXT: vrgather.vi v20, v10, 9
+; RV64-NEXT: vrgather.vi v21, v10, 10
+; RV64-NEXT: vrgather.vi v22, v10, 11
+; RV64-NEXT: vrgather.vi v23, v10, 12
; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64-NEXT: vslidedown.vi v24, v10, 16
-; RV64-NEXT: vmv.x.s a1, v24
+; RV64-NEXT: vmv.x.s a0, v24
; RV64-NEXT: vslidedown.vi v24, v10, 17
-; RV64-NEXT: vmv.x.s a2, v24
+; RV64-NEXT: vmv.x.s a1, v24
; RV64-NEXT: vslidedown.vi v24, v10, 18
-; RV64-NEXT: vmv.x.s a3, v24
+; RV64-NEXT: vmv.x.s a2, v24
; RV64-NEXT: vslidedown.vi v24, v10, 19
-; RV64-NEXT: vmv.x.s a4, v24
+; RV64-NEXT: vmv.x.s a3, v24
; RV64-NEXT: vslidedown.vi v24, v10, 20
-; RV64-NEXT: vmv.x.s a5, v24
+; RV64-NEXT: vmv.x.s a4, v24
; RV64-NEXT: vslidedown.vi v24, v10, 21
-; RV64-NEXT: vmv.x.s a6, v24
+; RV64-NEXT: vmv.x.s a5, v24
; RV64-NEXT: vslidedown.vi v24, v10, 22
-; RV64-NEXT: vmv.x.s a7, v24
+; RV64-NEXT: vmv.x.s a6, v24
; RV64-NEXT: vslidedown.vi v24, v10, 23
-; RV64-NEXT: vmv.x.s t0, v24
+; RV64-NEXT: vmv.x.s a7, v24
; RV64-NEXT: vslidedown.vi v24, v10, 24
-; RV64-NEXT: vmv.x.s t1, v24
+; RV64-NEXT: vmv.x.s t0, v24
; RV64-NEXT: vslidedown.vi v24, v10, 25
-; RV64-NEXT: vmv.x.s t2, v24
+; RV64-NEXT: vmv.x.s t1, v24
; RV64-NEXT: vslidedown.vi v24, v10, 26
-; RV64-NEXT: vmv.x.s t3, v24
+; RV64-NEXT: vmv.x.s t2, v24
; RV64-NEXT: vslidedown.vi v24, v10, 27
-; RV64-NEXT: vmv.x.s t4, v24
+; RV64-NEXT: vmv.x.s t3, v24
; RV64-NEXT: vslidedown.vi v24, v10, 28
-; RV64-NEXT: vmv.x.s t5, v24
+; RV64-NEXT: vmv.x.s t4, v24
; RV64-NEXT: vslidedown.vi v24, v10, 29
-; RV64-NEXT: vmv.x.s t6, v24
+; RV64-NEXT: vmv.x.s t5, v24
; RV64-NEXT: vslidedown.vi v24, v10, 30
-; RV64-NEXT: vmv.x.s s0, v24
+; RV64-NEXT: vmv.x.s t6, v24
; RV64-NEXT: vslidedown.vi v24, v10, 31
-; RV64-NEXT: vmv.x.s s1, v24
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v11, v10, 13
-; RV64-NEXT: vslidedown.vi v24, v10, 14
-; RV64-NEXT: vslidedown.vi v10, v10, 15
-; RV64-NEXT: vmv.x.s s2, v12
-; RV64-NEXT: vmv.x.s s3, v13
-; RV64-NEXT: vmv.x.s s4, v14
-; RV64-NEXT: vmv.x.s s5, v15
-; RV64-NEXT: vmv.x.s s6, v16
-; RV64-NEXT: vmv.x.s s7, v17
-; RV64-NEXT: vmv.x.s s8, v18
-; RV64-NEXT: vmv.x.s s9, v19
-; RV64-NEXT: vmv.x.s s10, v20
-; RV64-NEXT: vmv.x.s s11, v21
-; RV64-NEXT: vmv.x.s ra, v22
-; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; RV64-NEXT: ld a0, 0(sp) # 8-byte Folded Reload
-; RV64-NEXT: vmseq.vx v12, v8, a0
-; RV64-NEXT: vmv.x.s a0, v23
-; RV64-NEXT: vmseq.vx v13, v8, s2
-; RV64-NEXT: vmv.x.s s2, v11
-; RV64-NEXT: vmseq.vx v11, v8, s3
-; RV64-NEXT: vmv.x.s s3, v24
-; RV64-NEXT: vmseq.vx v14, v8, s4
-; RV64-NEXT: vmv.x.s s4, v10
-; RV64-NEXT: vmseq.vx v10, v8, s5
-; RV64-NEXT: vmor.mm v12, v12, v13
-; RV64-NEXT: vmseq.vx v13, v8, s6
-; RV64-NEXT: vmor.mm v11, v12, v11
-; RV64-NEXT: vmseq.vx v12, v8, s7
-; RV64-NEXT: vmor.mm v11, v11, v14
-; RV64-NEXT: vmseq.vx v14, v8, s8
-; RV64-NEXT: vmor.mm v10, v11, v10
-; RV64-NEXT: vmseq.vx v11, v8, s9
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, s10
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s11
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, ra
-; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, a0
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, s2
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s3
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, s4
-; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmv.x.s s0, v24
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vrgather.vi v11, v10, 13
+; RV64-NEXT: vrgather.vi v24, v10, 14
+; RV64-NEXT: vrgather.vi v25, v10, 15
+; RV64-NEXT: vmseq.vv v9, v8, v9
+; RV64-NEXT: vmseq.vv v10, v8, v12
+; RV64-NEXT: vmor.mm v9, v10, v9
+; RV64-NEXT: vmseq.vv v10, v8, v13
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v14
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v15
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v16
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v17
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v18
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v19
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v20
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v21
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v22
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vv v10, v8, v23
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vx v10, v8, a0
+; RV64-NEXT: vmseq.vv v11, v8, v11
+; RV64-NEXT: vmor.mm v9, v9, v11
; RV64-NEXT: vmseq.vx v11, v8, a1
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, a2
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, a3
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, a4
-; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vv v12, v8, v24
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v12, v8, a2
+; RV64-NEXT: vmseq.vv v13, v8, v25
+; RV64-NEXT: vmor.mm v9, v9, v13
+; RV64-NEXT: vmseq.vx v13, v8, a3
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vx v10, v8, a4
+; RV64-NEXT: vmor.mm v9, v9, v11
; RV64-NEXT: vmseq.vx v11, v8, a5
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, a6
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, a7
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, t0
-; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v12, v8, a6
+; RV64-NEXT: vmor.mm v9, v9, v13
+; RV64-NEXT: vmseq.vx v13, v8, a7
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vx v10, v8, t0
+; RV64-NEXT: vmor.mm v9, v9, v11
; RV64-NEXT: vmseq.vx v11, v8, t1
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, t2
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, t3
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, t4
-; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v12, v8, t2
+; RV64-NEXT: vmor.mm v9, v9, v13
+; RV64-NEXT: vmseq.vx v13, v8, t3
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vx v10, v8, t4
+; RV64-NEXT: vmor.mm v9, v9, v11
; RV64-NEXT: vmseq.vx v11, v8, t5
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, t6
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s0
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v11, v8, s1
-; RV64-NEXT: vmor.mm v8, v10, v11
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v12, v8, t6
+; RV64-NEXT: vmor.mm v9, v9, v13
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmor.mm v9, v9, v11
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v8, v8, s0
+; RV64-NEXT: vmor.mm v8, v9, v8
; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: .cfi_restore s1
-; RV64-NEXT: .cfi_restore s2
-; RV64-NEXT: .cfi_restore s3
-; RV64-NEXT: .cfi_restore s4
-; RV64-NEXT: .cfi_restore s5
-; RV64-NEXT: .cfi_restore s6
-; RV64-NEXT: .cfi_restore s7
-; RV64-NEXT: .cfi_restore s8
-; RV64-NEXT: .cfi_restore s9
-; RV64-NEXT: .cfi_restore s10
-; RV64-NEXT: .cfi_restore s11
-; RV64-NEXT: addi sp, sp, 112
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
- %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
- ret <vscale x 16 x i1> %r
-}
-
-define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
-; CHECK-LABEL: match_v16i8_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v0, v10, 1
-; CHECK-NEXT: vrgather.vi v9, v10, 0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs1r.v v9, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vrgather.vi v9, v10, 2
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs1r.v v9, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vrgather.vi v13, v10, 3
-; CHECK-NEXT: vrgather.vi v14, v10, 4
-; CHECK-NEXT: vrgather.vi v15, v10, 5
-; CHECK-NEXT: vrgather.vi v16, v10, 6
-; CHECK-NEXT: vrgather.vi v17, v10, 7
-; CHECK-NEXT: vrgather.vi v18, v10, 8
-; CHECK-NEXT: vrgather.vi v19, v10, 9
-; CHECK-NEXT: vrgather.vi v20, v10, 10
-; CHECK-NEXT: vrgather.vi v21, v10, 11
-; CHECK-NEXT: vrgather.vi v22, v10, 12
-; CHECK-NEXT: vrgather.vi v23, v10, 13
-; CHECK-NEXT: vrgather.vi v24, v10, 14
-; CHECK-NEXT: vrgather.vi v25, v10, 15
-; CHECK-NEXT: vrgather.vi v26, v10, 16
-; CHECK-NEXT: vrgather.vi v27, v10, 17
-; CHECK-NEXT: vrgather.vi v28, v10, 18
-; CHECK-NEXT: vrgather.vi v29, v10, 19
-; CHECK-NEXT: vrgather.vi v30, v10, 20
-; CHECK-NEXT: vrgather.vi v31, v10, 21
-; CHECK-NEXT: vrgather.vi v7, v10, 22
-; CHECK-NEXT: vrgather.vi v6, v10, 23
-; CHECK-NEXT: vrgather.vi v5, v10, 24
-; CHECK-NEXT: vrgather.vi v4, v10, 25
-; CHECK-NEXT: vrgather.vi v3, v10, 26
-; CHECK-NEXT: vrgather.vi v2, v10, 27
-; CHECK-NEXT: vrgather.vi v1, v10, 28
-; CHECK-NEXT: vrgather.vi v12, v10, 29
-; CHECK-NEXT: vrgather.vi v9, v10, 30
-; CHECK-NEXT: vrgather.vi v11, v10, 31
-; CHECK-NEXT: vmseq.vv v10, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmseq.vv v0, v8, v0
-; CHECK-NEXT: vmor.mm v10, v0, v10
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmseq.vv v0, v8, v0
-; CHECK-NEXT: vmor.mm v10, v10, v0
-; CHECK-NEXT: vmseq.vv v13, v8, v13
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v14
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v15
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v16
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v17
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v18
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v19
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v20
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v21
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v22
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v23
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v24
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v25
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v26
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v27
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v28
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v29
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v30
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v31
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v7
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v6
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v5
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v4
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v3
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v2
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v13, v8, v1
-; CHECK-NEXT: vmor.mm v10, v10, v13
-; CHECK-NEXT: vmseq.vv v12, v8, v12
-; CHECK-NEXT: vmseq.vv v9, v8, v9
-; CHECK-NEXT: vmor.mm v10, v10, v12
-; CHECK-NEXT: vmor.mm v9, v10, v9
-; CHECK-NEXT: vmseq.vv v8, v8, v11
-; CHECK-NEXT: vmor.mm v8, v9, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmand.mm v0, v8, v9
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
@@ -1040,22 +1031,67 @@ define <2 x i1> @match_v2xi64_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %ma
}
define <2 x i1> @match_v2xi64_v4i64(<2 x i64> %op1, <4 x i64> %op2, <2 x i1> %mask) {
-; CHECK-LABEL: match_v2xi64_v4i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v10, 1
-; CHECK-NEXT: vrgather.vi v11, v10, 0
-; CHECK-NEXT: vrgather.vi v12, v10, 2
-; CHECK-NEXT: vrgather.vi v13, v10, 3
-; CHECK-NEXT: vmseq.vv v9, v8, v9
-; CHECK-NEXT: vmseq.vv v10, v8, v11
-; CHECK-NEXT: vmseq.vv v11, v8, v12
-; CHECK-NEXT: vmor.mm v9, v10, v9
-; CHECK-NEXT: vmor.mm v9, v9, v11
-; CHECK-NEXT: vmseq.vv v8, v8, v13
-; CHECK-NEXT: vmor.mm v8, v9, v8
-; CHECK-NEXT: vmand.mm v0, v8, v0
-; CHECK-NEXT: ret
+; RV32-LABEL: match_v2xi64_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v10, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vsrl.vx v12, v12, a0
+; RV32-NEXT: vmv.x.s a2, v12
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vrgather.vi v9, v10, 1
+; RV32-NEXT: vrgather.vi v12, v10, 0
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v10, 3
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsrl.vx v10, v10, a0
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a2, 4(sp)
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: sw a3, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vmseq.vv v9, v8, v9
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vmseq.vv v11, v8, v12
+; RV32-NEXT: vlse64.v v12, (a1), zero
+; RV32-NEXT: vmor.mm v9, v11, v9
+; RV32-NEXT: vmseq.vv v10, v8, v10
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vv v8, v8, v12
+; RV32-NEXT: vmor.mm v8, v9, v8
+; RV32-NEXT: vmand.mm v0, v8, v0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: match_v2xi64_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vrgather.vi v9, v10, 1
+; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v10, 2
+; RV64-NEXT: vmv.x.s a0, v12
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vrgather.vi v12, v10, 0
+; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v10, 3
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmseq.vv v9, v8, v9
+; RV64-NEXT: vmseq.vv v11, v8, v12
+; RV64-NEXT: vmseq.vx v12, v8, a0
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: vmor.mm v9, v11, v9
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v8, v8, a0
+; RV64-NEXT: vmor.mm v8, v9, v8
+; RV64-NEXT: vmand.mm v0, v8, v0
+; RV64-NEXT: ret
%r = tail call <2 x i1> @llvm.experimental.vector.match(<2 x i64> %op1, <4 x i64> %op2, <2 x i1> %mask)
ret <2 x i1> %r
}
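For context on the regenerated checks above: each needle element of the <4 x i64> second operand is splatted and compared against %op1, and a splat only takes the new vrgather.vi path when its constant extract index is provably inside the <2 x i64> result type. That is why indices 0 and 1 use vrgather.vi while indices 2 and 3 are still moved to scalars (vslidedown.vi plus vmv.x.s on RV64, with an extra 32-bit split through the stack on RV32). The IR below is an illustrative sketch only, not taken from the test file, and the function names are invented; it shows the two cases in isolation.

; In-bounds case: index 1 is below the 2-element result type, so this splat
; becomes eligible for the gather lowering enabled by this patch.
define <2 x i64> @splat_needle1(<4 x i64> %needles) {
  %e = extractelement <4 x i64> %needles, i64 1
  %i = insertelement <2 x i64> poison, i64 %e, i32 0
  %s = shufflevector <2 x i64> %i, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %s
}

; Out-of-bounds case: index 3 is not known to lie within <2 x i64>, so
; matchSplatAsGather bails out and the splat keeps the scalar-extract lowering.
define <2 x i64> @splat_needle3(<4 x i64> %needles) {
  %e = extractelement <4 x i64> %needles, i64 3
  %i = insertelement <2 x i64> poison, i64 %e, i32 0
  %s = shufflevector <2 x i64> %i, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %s
}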
>From bc69e7a01902297464cc12453e5f14cb7ff42b77 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Tue, 14 Jan 2025 12:28:22 -0300
Subject: [PATCH 7/7] Renamed variables
Signed-off-by: Mikhail R. Gadelha <mikhail at igalia.com>
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 40 ++++++++++-----------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1cbec35c5214dc..3e466138928df2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3507,17 +3507,17 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
// Match a splatted value (SPLAT_VECTOR/BUILD_VECTOR) of an EXTRACT_VECTOR_ELT
// and lower it as a VRGATHER_VX_VL from the source vector.
-static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
+static SDValue matchSplatAsGather(SDValue SplatVal, MVT SrcVT, const SDLoc &DL,
SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
- SDValue Vec = SplatVal.getOperand(0);
+ SDValue SplatVec = SplatVal.getOperand(0);
// Don't perform this optimization for i1 vectors, or if the element types are
// different
// FIXME: Support i1 vectors, maybe by promoting to i8?
- MVT EltTy = VT.getVectorElementType();
- MVT VecVT = Vec.getSimpleValueType();
+ MVT EltTy = SrcVT.getVectorElementType();
+ MVT VecVT = SplatVec.getSimpleValueType();
if (EltTy == MVT::i1 || EltTy != VecVT.getVectorElementType())
return SDValue();
SDValue Idx = SplatVal.getOperand(1);
@@ -3525,45 +3525,45 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
if (Idx.getValueType() != Subtarget.getXLenVT())
return SDValue();
- // Check that we know Idx lies within VT
+ // Check that we know Idx lies within SrcVT
if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
- if (CIdx->getZExtValue() >= VT.getVectorElementCount().getKnownMinValue())
+ if (CIdx->getZExtValue() >= SrcVT.getVectorElementCount().getKnownMinValue())
return SDValue();
}
- else if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits()))
+ else if (!TypeSize::isKnownLE(SplatVec.getValueSizeInBits(), SrcVT.getSizeInBits()))
return SDValue();
// Convert fixed length vectors to scalable
- MVT ContainerVT = VT;
- if (VT.isFixedLengthVector())
- ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+ MVT ContainerVT = SrcVT;
+ if (SrcVT.isFixedLengthVector())
+ ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT, Subtarget);
MVT ContainerVecVT = VecVT;
if (VecVT.isFixedLengthVector()) {
ContainerVecVT = getContainerForFixedLengthVector(DAG, VecVT, Subtarget);
- Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget);
+ SplatVec = convertToScalableVector(ContainerVecVT, SplatVec, DAG, Subtarget);
}
- // Put Vec in a VT sized vector
+ // Put SplatVec in a SrcVT sized vector
if (ContainerVecVT.getVectorMinNumElements() <
ContainerVT.getVectorMinNumElements())
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
- DAG.getUNDEF(ContainerVT), Vec,
+ SplatVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), SplatVec,
DAG.getVectorIdxConstant(0, DL));
else
- Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+ SplatVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, SplatVec,
DAG.getVectorIdxConstant(0, DL));
- // We checked that Idx fits inside VT earlier
- auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+ // We checked that Idx fits inside SrcVT earlier
+ auto [Mask, VL] = getDefaultVLOps(SrcVT, ContainerVT, DL, DAG, Subtarget);
- SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, Vec,
+ SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, SplatVec,
Idx, DAG.getUNDEF(ContainerVT), Mask, VL);
- if (!VT.isFixedLengthVector())
+ if (!SrcVT.isFixedLengthVector())
return Gather;
- return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+ return convertFromScalableVector(SrcVT, Gather, DAG, Subtarget);
}
/// Try and optimize BUILD_VECTORs with "dominant values" - these are values
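To make the renamed bounds check concrete: a constant extract index is accepted when it is below SrcVT's known minimum element count, and only a non-constant index still relies on the TypeSize::isKnownLE fallback. The function below is a hypothetical counterexample, not one of the patch's tests: because %idx is not a constant, the gather is still rejected whenever the source vector is wider than the result type.

; Hypothetical example: with a variable index the constant path cannot prove the
; access lies within <4 x i32>, and since <8 x i32> is wider than <4 x i32> the
; TypeSize check also fails, so matchSplatAsGather returns SDValue() here.
define <4 x i32> @splat_variable_idx(<8 x i32> %src, i64 %idx) {
  %e = extractelement <8 x i32> %src, i64 %idx
  %i = insertelement <4 x i32> poison, i32 %e, i32 0
  %s = shufflevector <4 x i32> %i, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}
; Replacing %idx with a constant in the range 0..3 would satisfy the new
; constant-index check and should allow the vrgather-based lowering even
; though the source vector is wider than the result type.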