[llvm] [RISCV] Combine vslideup_vl with known VL to a smaller LMUL (PR #66671)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 18 10:00:38 PDT 2023
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/66671
Similar to #66267, we can perform a vslideup_vl on a smaller type if we know the highest lane that will be written to, which can be determined from VL.
This is an alternative to #65997 and #66087.
Stacked upon #66267.
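To make the arithmetic concrete, here is a small standalone sketch (an illustration only, not the code in the patch; VLEN=128 i.e. Zvl128b and the helper name are assumptions made for this example). The highest lane a vslideup_vl with a constant VL can write is VL - 1, so the slide only needs the smallest register group that covers that lane:

  // Illustration only -- not the patch's code. VLEN=128 (Zvl128b) and the
  // helper name are assumptions made for this sketch.
  #include <cstdio>

  // Smallest LMUL (1, 2, 4 or 8) whose register group covers elements
  // [0, MaxIdx] for a given SEW and VLEN (both in bits).
  static unsigned smallestLMULForIndex(unsigned MaxIdx, unsigned SEW,
                                       unsigned VLEN) {
    unsigned ElemsPerReg = VLEN / SEW;
    for (unsigned LMUL = 1; LMUL <= 8; LMUL *= 2)
      if (MaxIdx < ElemsPerReg * LMUL)
        return LMUL;
    return 8; // the index needs the whole group; nothing to shrink
  }

  int main() {
    // Inserting a 2 x i32 subvector at index 2 into an m4 vector lowers to a
    // vslideup_vl with VL = 4. The highest lane written is VL - 1 = 3, which
    // fits in a single e32 register with Zvl128b, so the slideup can run at
    // m1 instead of m4 (this prints "m1").
    printf("vslideup e32, VL=4 -> m%u\n", smallestLMULForIndex(4 - 1, 32, 128));
    return 0;
  }

The insert_subvector and insertelt test diffs below follow the same pattern: the vsetvli LMUL drops while the slide offset and VL stay unchanged.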
From c44ca7326774fa3a9083ef43c619f758f2a0dc77 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 13 Sep 2023 19:07:55 +0100
Subject: [PATCH 1/2] [RISCV] Combine vslidedown_vl with known VL and offset to
a smaller LMUL
If we know the VL and offset of a vslidedown_vl, we can work out the minimum
number of registers it's going to operate across. We can reuse the logic from
extract_vector_elt to perform it in a smaller type and reduce the LMUL.
The aim is to generalize #65598 and hopefully extend this to vslideup_vl too so
that we can get the same optimisation for insert_subvector and
insert_vector_elt.
One observation from adding this is that the vslide*_vl nodes all take a mask
operand, but currently anything other than vmset_vl will fail to select, as all
the patterns expect true_mask. So we need to create a new vmset_vl instead of
using extract_subvector on the existing vmset_vl.
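To illustrate the register-count reasoning from the first paragraph, here is a minimal standalone sketch (again just an illustration, not the combine itself; VLEN=128 is an assumption for the example). A vslidedown_vl with constant VL and offset reads at most lane VL + offset - 1, which bounds how many registers, and hence how small an LMUL, the slide needs:

  // Illustration only -- not the patch's code; VLEN=128 (Zvl128b) is an
  // assumption made for this example.
  #include <cstdio>

  int main() {
    unsigned VLEN = 128, SEW = 32, Offset = 4, VL = 2;
    unsigned ElemsPerReg = VLEN / SEW;               // 4 e32 elements per register
    unsigned MaxIdx = VL + Offset - 1;               // highest lane read: 5
    unsigned RegsTouched = MaxIdx / ElemsPerReg + 1; // 2 registers
    unsigned LMUL = 1;
    while (LMUL < RegsTouched)                       // round up to a power of two
      LMUL *= 2;
    // Prints "reads lanes 0..5, touches 2 registers -> can run at m2", i.e.
    // a slidedown written against nxv8i32 (m4) can be shrunk to nxv4i32 (m2).
    printf("reads lanes 0..%u, touches %u registers -> can run at m%u\n",
           MaxIdx, RegsTouched, LMUL);
    return 0;
  }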
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 46 ++++-
.../CodeGen/RISCV/rvv/extractelt-int-rv32.ll | 18 +-
.../rvv/fixed-vectors-int-explodevector.ll | 160 ++++++++++--------
3 files changed, 135 insertions(+), 89 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index de58335b435651c..b960e44d7846cd7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8805,15 +8805,6 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
}
- // Shrink down Vec so we're performing the slidedown on a smaller LMUL.
- unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
- if (auto ShrunkVT =
- getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
- ContainerVT = *ShrunkVT;
- Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
- DAG.getVectorIdxConstant(0, DL));
- }
-
SDValue Mask =
getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
// Set the vector length to only the number of elements we care about. This
@@ -14260,6 +14251,43 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
return V;
break;
+ case RISCVISD::VSLIDEDOWN_VL: {
+ MVT OrigVT = N->getSimpleValueType(0);
+ auto *CVL = dyn_cast<ConstantSDNode>(N->getOperand(4));
+ auto *CIdx = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!CVL || !CIdx)
+ break;
+ unsigned MaxIdx = CVL->getZExtValue() + CIdx->getZExtValue() - 1;
+ // We can try and reduce the LMUL that a vslidedown uses if we know where
+ // the maximum index is. For example, if the target has Zvl128b, a
+ // vslidedown of e32 with an offset of 4 and VL of 2 is only going to
+ // read from the first 2 registers at most. So if we were operating at
+ // LMUL=4 (nxv8i32), we can reduce it to LMUL=2(nxv4i32).
+ if (auto ShrunkVT =
+ getSmallestVTForIndex(OrigVT, MaxIdx, DL, DAG, Subtarget)) {
+ SDValue ShrunkPassthru =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, *ShrunkVT, N->getOperand(0),
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue ShrunkInVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, *ShrunkVT, N->getOperand(1),
+ DAG.getVectorIdxConstant(0, DL));
+
+ // The only mask ever used in vslide*_vl nodes is vmset_vl, and the only
+ // patterns on vslide*_vl only accept vmset_vl. So create a new vmset
+ // since using an extract_subvector breaks patterns.
+ assert(N->getOperand(3).getOpcode() == RISCVISD::VMSET_VL);
+ SDValue ShrunkMask =
+ DAG.getNode(RISCVISD::VMSET_VL, SDLoc(N), getMaskTypeFor(*ShrunkVT),
+ N->getOperand(4));
+ SDValue ShrunkSlidedown =
+ DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, *ShrunkVT,
+ {ShrunkPassthru, ShrunkInVec, N->getOperand(2),
+ ShrunkMask, N->getOperand(4), N->getOperand(5)});
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigVT, N->getOperand(0),
+ ShrunkSlidedown, DAG.getVectorIdxConstant(0, DL));
+ }
+ break;
+ }
case RISCVISD::VFMV_V_F_VL: {
const MVT VT = N->getSimpleValueType(0);
SDValue Passthru = N->getOperand(0);
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index fd2f89e26e59809..c3181a296abe06d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -679,12 +679,13 @@ define i64 @extractelt_nxv4i64_0(<vscale x 4 x i64> %v) {
define i64 @extractelt_nxv4i64_imm(<vscale x 4 x i64> %v) {
; CHECK-LABEL: extractelt_nxv4i64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsrl.vx v12, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v12
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a1
+; CHECK-NEXT: vmv.x.s a1, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i64> %v, i32 2
ret i64 %r
@@ -720,12 +721,13 @@ define i64 @extractelt_nxv8i64_0(<vscale x 8 x i64> %v) {
define i64 @extractelt_nxv8i64_imm(<vscale x 8 x i64> %v) {
; CHECK-LABEL: extractelt_nxv8i64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsrl.vx v16, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v16
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a1
+; CHECK-NEXT: vmv.x.s a1, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i64> %v, i32 2
ret i64 %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 4e60edf058450f0..6e0ca4cba6bd6d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -875,11 +875,15 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV32-NEXT: vsrl.vx v12, v8, a0
; RV32-NEXT: vmv.x.s a1, v12
; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v12, v8, 1
-; RV32-NEXT: vsrl.vx v16, v12, a0
-; RV32-NEXT: vmv.x.s a3, v16
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vx v12, v12, a0
; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vsrl.vx v16, v12, a0
; RV32-NEXT: vmv.x.s a5, v16
; RV32-NEXT: vmv.x.s a6, v12
@@ -903,19 +907,19 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV32-NEXT: vsrl.vx v12, v8, a0
; RV32-NEXT: vmv.x.s a0, v12
; RV32-NEXT: vmv.x.s s0, v8
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a4, a2, a4
-; RV32-NEXT: sltu a2, a4, a2
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a3, a2, a3
+; RV32-NEXT: sltu a2, a3, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a2, a6, a4
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a2, a6, a3
; RV32-NEXT: add a1, a1, a5
-; RV32-NEXT: add a2, a2, a7
-; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: add t0, a6, t0
-; RV32-NEXT: sltu a2, t0, a6
-; RV32-NEXT: add a2, a2, t1
+; RV32-NEXT: sltu a3, t0, a6
+; RV32-NEXT: add a2, a2, a7
; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a3, a3, t1
+; RV32-NEXT: add a1, a1, a3
; RV32-NEXT: add t2, t0, t2
; RV32-NEXT: sltu a2, t2, t0
; RV32-NEXT: add a2, a2, t3
@@ -1029,115 +1033,127 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV32-NEXT: vmv.x.s a0, v16
; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 1
+; RV32-NEXT: vmv.x.s a3, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s a4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 2
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s a5, v24
; RV32-NEXT: vmv.x.s a6, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 2
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s a3, v24
-; RV32-NEXT: vmv.x.s a4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 3
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s2, v24
+; RV32-NEXT: vmv.x.s t0, v24
; RV32-NEXT: vmv.x.s a7, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 4
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s3, v24
-; RV32-NEXT: vmv.x.s t0, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 5
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s4, v24
; RV32-NEXT: vmv.x.s t1, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 6
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s5, v24
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s t3, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 5
; RV32-NEXT: vmv.x.s t2, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s t5, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 6
+; RV32-NEXT: vmv.x.s t4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s ra, v16
; RV32-NEXT: vslidedown.vi v16, v8, 7
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s6, v24
-; RV32-NEXT: vmv.x.s t3, v16
+; RV32-NEXT: vmv.x.s s5, v24
+; RV32-NEXT: vmv.x.s t6, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s7, v24
-; RV32-NEXT: vmv.x.s t4, v16
+; RV32-NEXT: vmv.x.s s6, v24
+; RV32-NEXT: vmv.x.s s0, v16
; RV32-NEXT: vslidedown.vi v16, v8, 9
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s8, v24
-; RV32-NEXT: vmv.x.s t5, v16
+; RV32-NEXT: vmv.x.s s7, v24
+; RV32-NEXT: vmv.x.s s1, v16
; RV32-NEXT: vslidedown.vi v16, v8, 10
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s9, v24
-; RV32-NEXT: vmv.x.s t6, v16
+; RV32-NEXT: vmv.x.s s8, v24
+; RV32-NEXT: vmv.x.s s2, v16
; RV32-NEXT: vslidedown.vi v16, v8, 11
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s10, v24
-; RV32-NEXT: vmv.x.s s0, v16
+; RV32-NEXT: vmv.x.s s9, v24
+; RV32-NEXT: vmv.x.s s3, v16
; RV32-NEXT: vslidedown.vi v16, v8, 12
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s11, v24
-; RV32-NEXT: vmv.x.s s1, v16
+; RV32-NEXT: vmv.x.s s10, v24
+; RV32-NEXT: vmv.x.s s4, v16
; RV32-NEXT: vslidedown.vi v0, v8, 13
; RV32-NEXT: vsrl.vx v16, v0, a1
-; RV32-NEXT: vmv.x.s ra, v16
+; RV32-NEXT: vmv.x.s s11, v16
; RV32-NEXT: vslidedown.vi v16, v8, 14
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vslidedown.vi v8, v8, 15
; RV32-NEXT: vmv.x.s a2, v0
; RV32-NEXT: vsrl.vx v0, v8, a1
; RV32-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: add a5, a1, a5
-; RV32-NEXT: add a6, a0, a6
-; RV32-NEXT: sltu a0, a6, a0
-; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: sltu a1, a4, a6
-; RV32-NEXT: add a1, a1, s2
+; RV32-NEXT: add a4, a1, a4
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: sltu a0, a3, a0
+; RV32-NEXT: add a0, a4, a0
+; RV32-NEXT: add a0, a0, a5
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a1, a6, a3
+; RV32-NEXT: add a1, a1, t0
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a7, a4, a7
-; RV32-NEXT: sltu a1, a7, a4
-; RV32-NEXT: add a1, a1, s3
+; RV32-NEXT: add a7, a6, a7
+; RV32-NEXT: sltu a1, a7, a6
+; RV32-NEXT: add a1, a1, t3
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t0, a7, t0
-; RV32-NEXT: sltu a1, t0, a7
-; RV32-NEXT: add a1, a1, s4
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t1, t0, t1
-; RV32-NEXT: sltu a1, t1, t0
-; RV32-NEXT: add a1, a1, s5
+; RV32-NEXT: add t1, a7, t1
+; RV32-NEXT: sltu a1, t1, a7
+; RV32-NEXT: add a1, a1, t5
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t2, t1, t2
; RV32-NEXT: sltu a1, t2, t1
+; RV32-NEXT: add a1, a1, ra
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add t4, t2, t4
+; RV32-NEXT: sltu a1, t4, t2
+; RV32-NEXT: add a1, a1, s5
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add t6, t4, t6
+; RV32-NEXT: sltu a1, t6, t4
; RV32-NEXT: add a1, a1, s6
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t3, t2, t3
-; RV32-NEXT: sltu a1, t3, t2
+; RV32-NEXT: add s0, t6, s0
+; RV32-NEXT: sltu a1, s0, t6
; RV32-NEXT: add a1, a1, s7
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t4, t3, t4
-; RV32-NEXT: sltu a1, t4, t3
+; RV32-NEXT: add s1, s0, s1
+; RV32-NEXT: sltu a1, s1, s0
; RV32-NEXT: add a1, a1, s8
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t5, t4, t5
-; RV32-NEXT: sltu a1, t5, t4
+; RV32-NEXT: add s2, s1, s2
+; RV32-NEXT: sltu a1, s2, s1
; RV32-NEXT: add a1, a1, s9
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t6, t5, t6
-; RV32-NEXT: sltu a1, t6, t5
+; RV32-NEXT: add s3, s2, s3
+; RV32-NEXT: sltu a1, s3, s2
; RV32-NEXT: add a1, a1, s10
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s0, t6, s0
-; RV32-NEXT: sltu a1, s0, t6
+; RV32-NEXT: add s4, s3, s4
+; RV32-NEXT: sltu a1, s4, s3
; RV32-NEXT: add a1, a1, s11
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s1, s0, s1
-; RV32-NEXT: sltu a1, s1, s0
-; RV32-NEXT: add a1, a1, ra
-; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: vmv.x.s a1, v24
-; RV32-NEXT: add a2, s1, a2
-; RV32-NEXT: sltu a3, a2, s1
+; RV32-NEXT: add a2, s4, a2
+; RV32-NEXT: sltu a3, a2, s4
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: vmv.x.s a3, v16
; RV32-NEXT: add a0, a0, a1
From f54ebc7610f76a21cffae8f8047e35c8e9fc0ed5 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 18 Sep 2023 12:00:41 +0100
Subject: [PATCH 2/2] [RISCV] Combine vslideup_vl with known VL to a smaller
LMUL
Similar to #66267, we can perform a vslideup_vl on a smaller type if we know
the highest lane that will be written to, which can be determined from VL.
This is an alternative to #65997 and #66087.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 +-
.../RISCV/rvv/fixed-vectors-insert-i1.ll | 2 +-
.../rvv/fixed-vectors-insert-subvector.ll | 12 +-
.../CodeGen/RISCV/rvv/fixed-vectors-insert.ll | 2 +-
.../rvv/fixed-vectors-int-explodevector.ll | 66 +-
.../RISCV/rvv/fixed-vectors-masked-gather.ll | 258 +++----
.../rvv/fixed-vectors-strided-load-combine.ll | 12 +-
.../CodeGen/RISCV/rvv/fpclamptosat_vec.ll | 648 ++++++++++++------
llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll | 6 +-
.../CodeGen/RISCV/rvv/insertelt-int-rv32.ll | 22 +-
.../CodeGen/RISCV/rvv/insertelt-int-rv64.ll | 22 +-
12 files changed, 662 insertions(+), 424 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b960e44d7846cd7..179db28ec83e476 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14251,14 +14251,24 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
return V;
break;
+ case RISCVISD::VSLIDEUP_VL:
case RISCVISD::VSLIDEDOWN_VL: {
MVT OrigVT = N->getSimpleValueType(0);
auto *CVL = dyn_cast<ConstantSDNode>(N->getOperand(4));
- auto *CIdx = dyn_cast<ConstantSDNode>(N->getOperand(2));
- if (!CVL || !CIdx)
+ if (!CVL)
break;
- unsigned MaxIdx = CVL->getZExtValue() + CIdx->getZExtValue() - 1;
- // We can try and reduce the LMUL that a vslidedown uses if we know where
+
+ // The maximum index read or written is VL - 1 for vslideup, and VL + offset
+ // - 1 for vslidedown.
+ unsigned MaxIdx = CVL->getZExtValue() - 1;
+ if (N->getOpcode() == RISCVISD::VSLIDEDOWN_VL) {
+ auto *COffset = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!COffset)
+ break;
+ MaxIdx += COffset->getZExtValue();
+ }
+
+ // We can try and reduce the LMUL that a vslide* uses if we know where
// the maximum index is. For example, if the target has Zvl128b, a
// vslidedown of e32 with an offset of 4 and VL of 2 is only going to
// read from the first 2 registers at most. So if we were operating at
@@ -14280,7 +14290,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAG.getNode(RISCVISD::VMSET_VL, SDLoc(N), getMaskTypeFor(*ShrunkVT),
N->getOperand(4));
SDValue ShrunkSlidedown =
- DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, *ShrunkVT,
+ DAG.getNode(N->getOpcode(), DL, *ShrunkVT,
{ShrunkPassthru, ShrunkInVec, N->getOperand(2),
ShrunkMask, N->getOperand(4), N->getOperand(5)});
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigVT, N->getOperand(0),
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
index b3cbad3d9e6b1d7..f7737784d4ca57e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -108,7 +108,7 @@ define <64 x i1> @insertelt_v64i1(<64 x i1> %x, i1 %elt) nounwind {
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vmv.s.x v12, a0
-; CHECK-NEXT: vsetivli zero, 2, e8, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT: vand.vi v8, v8, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 1d6a45ed36f335c..133b09428ed961b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -27,7 +27,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 2
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
@@ -40,7 +40,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 6
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
@@ -65,7 +65,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %
; LMULMAX1-NEXT: vle32.v v16, (a0)
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m4, tu, ma
; LMULMAX1-NEXT: vmv.v.v v8, v12
-; LMULMAX1-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; LMULMAX1-NEXT: vsetivli zero, 8, e32, m2, tu, ma
; LMULMAX1-NEXT: vslideup.vi v8, v16, 4
; LMULMAX1-NEXT: ret
%sv = load <8 x i32>, ptr %svp
@@ -197,7 +197,7 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; LMULMAX2-NEXT: vle32.v v8, (a1)
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; LMULMAX2-NEXT: vslideup.vi v10, v8, 2
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -509,7 +509,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, <vscale x 16 x i64>* %o
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
; CHECK-NEXT: vslideup.vi v8, v16, 4
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: ret
@@ -539,7 +539,7 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, <vscale x 16 x i64>* %out) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v16, v8, 2
; CHECK-NEXT: vs8r.v v16, (a1)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 373a96356a207e2..6f5ab60fb4ad003 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -54,7 +54,7 @@ define <32 x i32> @insertelt_v32i32_4(<32 x i32> %a, i32 %y) {
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v16, a0
-; CHECK-NEXT: vsetivli zero, 5, e32, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v16, 4
; CHECK-NEXT: ret
%b = insertelement <32 x i32> %a, i32 %y, i32 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 6e0ca4cba6bd6d6..eb74c5e608b93ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -811,9 +811,11 @@ define i64 @explode_4xi64(<4 x i64> %v) {
; RV32-NEXT: vsrl.vx v10, v8, a0
; RV32-NEXT: vmv.x.s a1, v10
; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vsrl.vx v12, v10, a0
-; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vx v10, v10, a0
; RV32-NEXT: vmv.x.s a4, v10
; RV32-NEXT: vslidedown.vi v10, v8, 2
; RV32-NEXT: vsrl.vx v12, v10, a0
@@ -823,12 +825,12 @@ define i64 @explode_4xi64(<4 x i64> %v) {
; RV32-NEXT: vsrl.vx v10, v8, a0
; RV32-NEXT: vmv.x.s a0, v10
; RV32-NEXT: vmv.x.s a7, v8
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a4, a2, a4
-; RV32-NEXT: sltu a2, a4, a2
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a3, a2, a3
+; RV32-NEXT: sltu a2, a3, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a2, a6, a4
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a2, a6, a3
; RV32-NEXT: add a1, a1, a5
; RV32-NEXT: add a0, a2, a0
; RV32-NEXT: add a1, a1, a0
@@ -875,7 +877,7 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV32-NEXT: vsrl.vx v12, v8, a0
; RV32-NEXT: vmv.x.s a1, v12
; RV32-NEXT: vmv.x.s a2, v8
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v12, v8, 1
; RV32-NEXT: vmv.x.s a3, v12
; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
@@ -887,7 +889,9 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV32-NEXT: vsrl.vx v16, v12, a0
; RV32-NEXT: vmv.x.s a5, v16
; RV32-NEXT: vmv.x.s a6, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vsrl.vx v16, v12, a0
; RV32-NEXT: vmv.x.s a7, v16
; RV32-NEXT: vmv.x.s t0, v12
@@ -1033,7 +1037,7 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV32-NEXT: vmv.x.s a0, v16
; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 1
; RV32-NEXT: vmv.x.s a3, v16
; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
@@ -1045,7 +1049,7 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s a5, v24
; RV32-NEXT: vmv.x.s a6, v16
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 3
; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v24, v16, a1
@@ -1068,31 +1072,33 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV32-NEXT: vmv.x.s t4, v16
; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v16, a1
-; RV32-NEXT: vmv.x.s ra, v16
+; RV32-NEXT: vmv.x.s s0, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 7
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s5, v24
; RV32-NEXT: vmv.x.s t6, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s ra, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s s6, v24
-; RV32-NEXT: vmv.x.s s0, v16
+; RV32-NEXT: vmv.x.s s1, v16
; RV32-NEXT: vslidedown.vi v16, v8, 9
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s s7, v24
-; RV32-NEXT: vmv.x.s s1, v16
+; RV32-NEXT: vmv.x.s s2, v16
; RV32-NEXT: vslidedown.vi v16, v8, 10
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s s8, v24
-; RV32-NEXT: vmv.x.s s2, v16
+; RV32-NEXT: vmv.x.s s3, v16
; RV32-NEXT: vslidedown.vi v16, v8, 11
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s s9, v24
-; RV32-NEXT: vmv.x.s s3, v16
+; RV32-NEXT: vmv.x.s s4, v16
; RV32-NEXT: vslidedown.vi v16, v8, 12
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s s10, v24
-; RV32-NEXT: vmv.x.s s4, v16
+; RV32-NEXT: vmv.x.s s5, v16
; RV32-NEXT: vslidedown.vi v0, v8, 13
; RV32-NEXT: vsrl.vx v16, v0, a1
; RV32-NEXT: vmv.x.s s11, v16
@@ -1121,39 +1127,39 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t2, t1, t2
; RV32-NEXT: sltu a1, t2, t1
-; RV32-NEXT: add a1, a1, ra
+; RV32-NEXT: add a1, a1, s0
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t4, t2, t4
; RV32-NEXT: sltu a1, t4, t2
-; RV32-NEXT: add a1, a1, s5
+; RV32-NEXT: add a1, a1, ra
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t6, t4, t6
; RV32-NEXT: sltu a1, t6, t4
; RV32-NEXT: add a1, a1, s6
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s0, t6, s0
-; RV32-NEXT: sltu a1, s0, t6
+; RV32-NEXT: add s1, t6, s1
+; RV32-NEXT: sltu a1, s1, t6
; RV32-NEXT: add a1, a1, s7
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s1, s0, s1
-; RV32-NEXT: sltu a1, s1, s0
-; RV32-NEXT: add a1, a1, s8
-; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add s2, s1, s2
; RV32-NEXT: sltu a1, s2, s1
-; RV32-NEXT: add a1, a1, s9
+; RV32-NEXT: add a1, a1, s8
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add s3, s2, s3
; RV32-NEXT: sltu a1, s3, s2
-; RV32-NEXT: add a1, a1, s10
+; RV32-NEXT: add a1, a1, s9
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add s4, s3, s4
; RV32-NEXT: sltu a1, s4, s3
+; RV32-NEXT: add a1, a1, s10
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add s5, s4, s5
+; RV32-NEXT: sltu a1, s5, s4
; RV32-NEXT: add a1, a1, s11
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: vmv.x.s a1, v24
-; RV32-NEXT: add a2, s4, a2
-; RV32-NEXT: sltu a3, a2, s4
+; RV32-NEXT: add a2, s5, a2
+; RV32-NEXT: sltu a3, a2, s5
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: vmv.x.s a3, v16
; RV32-NEXT: add a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 3711f014e06478b..6c0288dd9b5d4ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -2431,7 +2431,7 @@ define <8 x i32> @mgather_v8i32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i32> %passthr
; RV64ZVE32F-NEXT: .LBB34_10: # %cond.load1
; RV64ZVE32F-NEXT: ld a2, 8(a0)
; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1
; RV64ZVE32F-NEXT: andi a2, a1, 4
@@ -2439,7 +2439,7 @@ define <8 x i32> @mgather_v8i32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i32> %passthr
; RV64ZVE32F-NEXT: .LBB34_11: # %cond.load4
; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
@@ -2447,7 +2447,7 @@ define <8 x i32> @mgather_v8i32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i32> %passthr
; RV64ZVE32F-NEXT: .LBB34_12: # %cond.load7
; RV64ZVE32F-NEXT: ld a2, 24(a0)
; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
@@ -2531,9 +2531,9 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB35_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -2579,7 +2579,7 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB35_6
@@ -2591,9 +2591,9 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB35_7
; RV64ZVE32F-NEXT: .LBB35_14: # %cond.load10
@@ -2681,9 +2681,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB36_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -2729,7 +2729,7 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB36_6
@@ -2741,9 +2741,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB36_7
; RV64ZVE32F-NEXT: .LBB36_14: # %cond.load10
@@ -2835,9 +2835,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB37_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -2885,7 +2885,7 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB37_6
@@ -2899,7 +2899,7 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB37_7
@@ -2993,9 +2993,9 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB38_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -3041,7 +3041,7 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB38_6
@@ -3053,9 +3053,9 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB38_7
; RV64ZVE32F-NEXT: .LBB38_14: # %cond.load10
@@ -3144,9 +3144,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB39_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -3192,7 +3192,7 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB39_6
@@ -3204,9 +3204,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB39_7
; RV64ZVE32F-NEXT: .LBB39_14: # %cond.load10
@@ -3299,9 +3299,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: lw a3, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a3
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vmv.s.x v9, a3
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB40_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -3349,7 +3349,7 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: lw a3, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v12, a3
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a3, a2, 8
; RV64ZVE32F-NEXT: beqz a3, .LBB40_6
@@ -3362,9 +3362,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: lw a3, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a3
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vmv.s.x v8, a3
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: beqz a3, .LBB40_7
; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load10
@@ -3455,7 +3455,7 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m,
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB41_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma
@@ -3499,9 +3499,9 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m,
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB41_6
; RV64ZVE32F-NEXT: .LBB41_13: # %cond.load7
@@ -3512,7 +3512,7 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m,
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB41_7
@@ -8234,7 +8234,7 @@ define <8 x float> @mgather_v8f32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x float> %pas
; RV64ZVE32F-NEXT: .LBB73_10: # %cond.load1
; RV64ZVE32F-NEXT: ld a2, 8(a0)
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1
; RV64ZVE32F-NEXT: andi a2, a1, 4
@@ -8242,7 +8242,7 @@ define <8 x float> @mgather_v8f32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x float> %pas
; RV64ZVE32F-NEXT: .LBB73_11: # %cond.load4
; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
@@ -8250,7 +8250,7 @@ define <8 x float> @mgather_v8f32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x float> %pas
; RV64ZVE32F-NEXT: .LBB73_12: # %cond.load7
; RV64ZVE32F-NEXT: ld a2, 24(a0)
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
@@ -8334,9 +8334,9 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB74_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -8382,7 +8382,7 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB74_6
@@ -8394,9 +8394,9 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB74_7
; RV64ZVE32F-NEXT: .LBB74_14: # %cond.load10
@@ -8484,9 +8484,9 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB75_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -8532,7 +8532,7 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB75_6
@@ -8544,9 +8544,9 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB75_7
; RV64ZVE32F-NEXT: .LBB75_14: # %cond.load10
@@ -8638,9 +8638,9 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB76_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -8688,7 +8688,7 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB76_6
@@ -8702,7 +8702,7 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB76_7
@@ -8796,9 +8796,9 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB77_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -8844,7 +8844,7 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB77_6
@@ -8856,9 +8856,9 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB77_7
; RV64ZVE32F-NEXT: .LBB77_14: # %cond.load10
@@ -8947,9 +8947,9 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB78_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -8995,7 +8995,7 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB78_6
@@ -9007,9 +9007,9 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB78_7
; RV64ZVE32F-NEXT: .LBB78_14: # %cond.load10
@@ -9102,9 +9102,9 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: flw fa5, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB79_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
@@ -9152,7 +9152,7 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: flw fa5, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
; RV64ZVE32F-NEXT: andi a3, a2, 8
; RV64ZVE32F-NEXT: beqz a3, .LBB79_6
@@ -9165,9 +9165,9 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: flw fa5, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: beqz a3, .LBB79_7
; RV64ZVE32F-NEXT: .LBB79_14: # %cond.load10
@@ -9258,7 +9258,7 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> %
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB80_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma
@@ -9302,9 +9302,9 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> %
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vfmv.s.f v14, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2
+; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB80_6
; RV64ZVE32F-NEXT: .LBB80_13: # %cond.load7
@@ -9315,7 +9315,7 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> %
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB80_7
@@ -12395,7 +12395,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB98_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
@@ -12417,9 +12417,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: .LBB98_8: # %else11
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 32
@@ -12433,7 +12433,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5
; RV64ZVE32F-NEXT: .LBB98_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
@@ -12456,9 +12456,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 9
+; RV64ZVE32F-NEXT: vmv.s.x v13, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 9
; RV64ZVE32F-NEXT: .LBB98_15: # %else26
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4
@@ -12472,7 +12472,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 10
; RV64ZVE32F-NEXT: .LBB98_17: # %else29
; RV64ZVE32F-NEXT: slli a2, a1, 52
@@ -12484,9 +12484,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 11
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 11
; RV64ZVE32F-NEXT: .LBB98_19: # %else32
; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 51
@@ -12497,9 +12497,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 12
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 12
; RV64ZVE32F-NEXT: .LBB98_21: # %else35
; RV64ZVE32F-NEXT: slli a2, a1, 50
; RV64ZVE32F-NEXT: bgez a2, .LBB98_23
@@ -12510,9 +12510,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 13
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 13
; RV64ZVE32F-NEXT: .LBB98_23: # %else38
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 49
@@ -12661,7 +12661,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2
; RV64ZVE32F-NEXT: andi a2, a1, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB98_6
@@ -12672,9 +12672,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 3
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB98_7
; RV64ZVE32F-NEXT: j .LBB98_8
@@ -12684,7 +12684,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 6
; RV64ZVE32F-NEXT: andi a2, a1, 128
; RV64ZVE32F-NEXT: beqz a2, .LBB98_12
@@ -12695,9 +12695,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 7
+; RV64ZVE32F-NEXT: vmv.s.x v13, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 7
; RV64ZVE32F-NEXT: andi a2, a1, 256
; RV64ZVE32F-NEXT: beqz a2, .LBB98_13
; RV64ZVE32F-NEXT: .LBB98_53: # %cond.load22
@@ -12707,9 +12707,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 8
+; RV64ZVE32F-NEXT: vmv.s.x v13, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 8
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: bnez a2, .LBB98_14
; RV64ZVE32F-NEXT: j .LBB98_15
@@ -12719,7 +12719,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 14
; RV64ZVE32F-NEXT: slli a2, a1, 48
; RV64ZVE32F-NEXT: bgez a2, .LBB98_25
@@ -12730,9 +12730,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 15
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 15
; RV64ZVE32F-NEXT: slli a2, a1, 47
; RV64ZVE32F-NEXT: bgez a2, .LBB98_26
; RV64ZVE32F-NEXT: .LBB98_56: # %cond.load46
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index f52ba6f51d5c897..eb91271bcdd1b5b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -30,7 +30,7 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-NEXT: vle16.v v10, (a2)
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vle16.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 8
@@ -80,7 +80,7 @@ define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-NEXT: vle8.v v12, (a2)
; CHECK-NO-MISALIGN-NEXT: addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT: vle8.v v14, (a0)
-; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT: vsetvli zero, zero, e16, m1, tu, ma
; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 4
; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v12, 8
@@ -193,7 +193,7 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-NEXT: vle16.v v12, (a2)
; CHECK-NEXT: addi a0, a0, 8
; CHECK-NEXT: vle16.v v14, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 8
@@ -263,7 +263,7 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-NEXT: vle16.v v12, (a0)
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: vle16.v v14, (a0)
-; RV32-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; RV32-NEXT: vsetivli zero, 8, e16, m1, tu, ma
; RV32-NEXT: vslideup.vi v8, v10, 4
; RV32-NEXT: vsetivli zero, 12, e16, m2, tu, ma
; RV32-NEXT: vslideup.vi v8, v12, 8
@@ -282,7 +282,7 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV64-NEXT: vle16.v v12, (a0)
; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: vle16.v v14, (a0)
-; RV64-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; RV64-NEXT: vsetivli zero, 8, e16, m1, tu, ma
; RV64-NEXT: vslideup.vi v8, v10, 4
; RV64-NEXT: vsetivli zero, 12, e16, m2, tu, ma
; RV64-NEXT: vslideup.vi v8, v12, 8
@@ -301,7 +301,7 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; ZVE64F-NEXT: vle16.v v12, (a0)
; ZVE64F-NEXT: add a0, a0, a2
; ZVE64F-NEXT: vle16.v v14, (a0)
-; ZVE64F-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; ZVE64F-NEXT: vsetivli zero, 8, e16, m1, tu, ma
; ZVE64F-NEXT: vslideup.vi v8, v10, 4
; ZVE64F-NEXT: vsetivli zero, 12, e16, m2, tu, ma
; ZVE64F-NEXT: vslideup.vi v8, v12, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 31e7e7be76c89b1..4741e55ab3a05fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -460,54 +460,49 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 16(a0)
-; CHECK-V-NEXT: lhu s2, 0(a0)
-; CHECK-V-NEXT: lhu a0, 8(a0)
+; CHECK-V-NEXT: lhu s1, 0(a0)
+; CHECK-V-NEXT: lhu s2, 8(a0)
+; CHECK-V-NEXT: lhu a0, 16(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: fmv.w.x fa0, s2
-; CHECK-V-NEXT: call __extendhfsf2@plt
-; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v8, v10, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s2
+; CHECK-V-NEXT: call __extendhfsf2@plt
+; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT: vmv.s.x v10, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: lui a0, 524288
@@ -632,54 +627,49 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 16(a0)
-; CHECK-V-NEXT: lhu s2, 0(a0)
-; CHECK-V-NEXT: lhu a0, 8(a0)
+; CHECK-V-NEXT: lhu s1, 0(a0)
+; CHECK-V-NEXT: lhu s2, 8(a0)
+; CHECK-V-NEXT: lhu a0, 16(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: fmv.w.x fa0, s2
-; CHECK-V-NEXT: call __extendhfsf2@plt
-; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v8, v10, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s2
+; CHECK-V-NEXT: call __extendhfsf2@plt
+; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT: vmv.s.x v10, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -813,54 +803,49 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 16(a0)
-; CHECK-V-NEXT: lhu s2, 0(a0)
-; CHECK-V-NEXT: lhu a0, 8(a0)
+; CHECK-V-NEXT: lhu s1, 0(a0)
+; CHECK-V-NEXT: lhu s2, 8(a0)
+; CHECK-V-NEXT: lhu a0, 16(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: fmv.w.x fa0, s2
-; CHECK-V-NEXT: call __extendhfsf2@plt
-; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v8, v10, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s2
+; CHECK-V-NEXT: call __extendhfsf2@plt
+; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT: vmv.s.x v10, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -1445,9 +1430,9 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s5, -56
; CHECK-V-NEXT: .cfi_offset s6, -64
; CHECK-V-NEXT: csrr a1, vlenb
-; CHECK-V-NEXT: slli a1, a1, 1
+; CHECK-V-NEXT: slli a1, a1, 2
; CHECK-V-NEXT: sub sp, sp, a1
-; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 56(a0)
; CHECK-V-NEXT: lhu s1, 48(a0)
; CHECK-V-NEXT: lhu s2, 40(a0)
@@ -1466,63 +1451,105 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s3
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 4
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 5
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 6
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
; CHECK-V-NEXT: lui a0, 8
@@ -1533,7 +1560,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -1728,9 +1755,9 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s5, -56
; CHECK-V-NEXT: .cfi_offset s6, -64
; CHECK-V-NEXT: csrr a1, vlenb
-; CHECK-V-NEXT: slli a1, a1, 1
+; CHECK-V-NEXT: slli a1, a1, 2
; CHECK-V-NEXT: sub sp, sp, a1
-; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 56(a0)
; CHECK-V-NEXT: lhu s1, 48(a0)
; CHECK-V-NEXT: lhu s2, 40(a0)
@@ -1749,63 +1776,105 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s3
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 4
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 5
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 6
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
; CHECK-V-NEXT: lui a0, 16
@@ -1814,7 +1883,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -2031,9 +2100,9 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s5, -56
; CHECK-V-NEXT: .cfi_offset s6, -64
; CHECK-V-NEXT: csrr a1, vlenb
-; CHECK-V-NEXT: slli a1, a1, 1
+; CHECK-V-NEXT: slli a1, a1, 2
; CHECK-V-NEXT: sub sp, sp, a1
-; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 56(a0)
; CHECK-V-NEXT: lhu s1, 48(a0)
; CHECK-V-NEXT: lhu s2, 40(a0)
@@ -2052,63 +2121,105 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s3
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 4
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 5
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 6
-; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: fmv.w.x fa0, s0
-; CHECK-V-NEXT: call __extendhfsf2@plt
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s0
+; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
; CHECK-V-NEXT: lui a0, 16
@@ -2118,7 +2229,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -3796,54 +3907,49 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 16(a0)
-; CHECK-V-NEXT: lhu s2, 0(a0)
-; CHECK-V-NEXT: lhu a0, 8(a0)
+; CHECK-V-NEXT: lhu s1, 0(a0)
+; CHECK-V-NEXT: lhu s2, 8(a0)
+; CHECK-V-NEXT: lhu a0, 16(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: fmv.w.x fa0, s2
-; CHECK-V-NEXT: call __extendhfsf2@plt
-; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v8, v10, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s2
+; CHECK-V-NEXT: call __extendhfsf2@plt
+; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT: vmv.s.x v10, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: lui a0, 524288
@@ -3966,54 +4072,49 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 16(a0)
-; CHECK-V-NEXT: lhu s2, 0(a0)
-; CHECK-V-NEXT: lhu a0, 8(a0)
+; CHECK-V-NEXT: lhu s1, 0(a0)
+; CHECK-V-NEXT: lhu s2, 8(a0)
+; CHECK-V-NEXT: lhu a0, 16(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: fmv.w.x fa0, s2
-; CHECK-V-NEXT: call __extendhfsf2@plt
-; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v8, v10, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s2
+; CHECK-V-NEXT: call __extendhfsf2@plt
+; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT: vmv.s.x v10, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -4146,54 +4247,49 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 16(a0)
-; CHECK-V-NEXT: lhu s2, 0(a0)
-; CHECK-V-NEXT: lhu a0, 8(a0)
+; CHECK-V-NEXT: lhu s1, 0(a0)
+; CHECK-V-NEXT: lhu s2, 8(a0)
+; CHECK-V-NEXT: lhu a0, 16(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: fmv.w.x fa0, s2
-; CHECK-V-NEXT: call __extendhfsf2@plt
-; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v8, v10, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s2
+; CHECK-V-NEXT: call __extendhfsf2@plt
+; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT: vmv.s.x v10, a0
+; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -4766,9 +4862,9 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s5, -56
; CHECK-V-NEXT: .cfi_offset s6, -64
; CHECK-V-NEXT: csrr a1, vlenb
-; CHECK-V-NEXT: slli a1, a1, 1
+; CHECK-V-NEXT: slli a1, a1, 2
; CHECK-V-NEXT: sub sp, sp, a1
-; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 56(a0)
; CHECK-V-NEXT: lhu s1, 48(a0)
; CHECK-V-NEXT: lhu s2, 40(a0)
@@ -4787,63 +4883,105 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s3
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 4
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 5
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 6
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
; CHECK-V-NEXT: lui a0, 8
@@ -4854,7 +4992,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -5045,9 +5183,9 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s5, -56
; CHECK-V-NEXT: .cfi_offset s6, -64
; CHECK-V-NEXT: csrr a1, vlenb
-; CHECK-V-NEXT: slli a1, a1, 1
+; CHECK-V-NEXT: slli a1, a1, 2
; CHECK-V-NEXT: sub sp, sp, a1
-; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 56(a0)
; CHECK-V-NEXT: lhu s1, 48(a0)
; CHECK-V-NEXT: lhu s2, 40(a0)
@@ -5066,63 +5204,105 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s3
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 4
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 5
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 6
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
; CHECK-V-NEXT: lui a0, 16
@@ -5131,7 +5311,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -5347,9 +5527,9 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s5, -56
; CHECK-V-NEXT: .cfi_offset s6, -64
; CHECK-V-NEXT: csrr a1, vlenb
-; CHECK-V-NEXT: slli a1, a1, 1
+; CHECK-V-NEXT: slli a1, a1, 2
; CHECK-V-NEXT: sub sp, sp, a1
-; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 56(a0)
; CHECK-V-NEXT: lhu s1, 48(a0)
; CHECK-V-NEXT: lhu s2, 40(a0)
@@ -5368,63 +5548,105 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s3
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 4
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 5
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 6
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2@plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
; CHECK-V-NEXT: lui a0, 16
@@ -5434,7 +5656,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
index 4bd9f7befa52a1c..141d1b9ed95441c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
@@ -119,7 +119,7 @@ define <vscale x 8 x half> @insertelt_nxv8f16_0(<vscale x 8 x half> %v, half %el
define <vscale x 8 x half> @insertelt_nxv8f16_imm(<vscale x 8 x half> %v, half %elt) {
; CHECK-LABEL: insertelt_nxv8f16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -153,7 +153,7 @@ define <vscale x 16 x half> @insertelt_nxv16f16_0(<vscale x 16 x half> %v, half
define <vscale x 16 x half> @insertelt_nxv16f16_imm(<vscale x 16 x half> %v, half %elt) {
; CHECK-LABEL: insertelt_nxv16f16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v12, fa0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -187,7 +187,7 @@ define <vscale x 32 x half> @insertelt_nxv32f16_0(<vscale x 32 x half> %v, half
define <vscale x 32 x half> @insertelt_nxv32f16_imm(<vscale x 32 x half> %v, half %elt) {
; CHECK-LABEL: insertelt_nxv32f16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -289,7 +289,7 @@ define <vscale x 4 x float> @insertelt_nxv4f32_0(<vscale x 4 x float> %v, float
define <vscale x 4 x float> @insertelt_nxv4f32_imm(<vscale x 4 x float> %v, float %elt) {
; CHECK-LABEL: insertelt_nxv4f32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -323,7 +323,7 @@ define <vscale x 8 x float> @insertelt_nxv8f32_0(<vscale x 8 x float> %v, float
define <vscale x 8 x float> @insertelt_nxv8f32_imm(<vscale x 8 x float> %v, float %elt) {
; CHECK-LABEL: insertelt_nxv8f32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v12, fa0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -357,7 +357,7 @@ define <vscale x 16 x float> @insertelt_nxv16f32_0(<vscale x 16 x float> %v, flo
define <vscale x 16 x float> @insertelt_nxv16f32_imm(<vscale x 16 x float> %v, float %elt) {
; CHECK-LABEL: insertelt_nxv16f32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -459,7 +459,7 @@ define <vscale x 4 x double> @insertelt_nxv4f64_0(<vscale x 4 x double> %v, doub
define <vscale x 4 x double> @insertelt_nxv4f64_imm(<vscale x 4 x double> %v, double %elt) {
; CHECK-LABEL: insertelt_nxv4f64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e64, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma
; CHECK-NEXT: vfmv.s.f v12, fa0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -493,7 +493,7 @@ define <vscale x 8 x double> @insertelt_nxv8f64_0(<vscale x 8 x double> %v, doub
define <vscale x 8 x double> @insertelt_nxv8f64_imm(<vscale x 8 x double> %v, double %elt) {
; CHECK-LABEL: insertelt_nxv8f64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll
index a7bd15f2a7b330a..911072d9571ff81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll
@@ -149,7 +149,7 @@ define <vscale x 16 x i1> @insertelt_nxv16i1(<vscale x 16 x i1> %x, i1 %elt) {
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 3, e8, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 3, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 2
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vand.vi v8, v8, 1
@@ -184,7 +184,7 @@ define <vscale x 32 x i1> @insertelt_nxv32i1(<vscale x 32 x i1> %x, i1 %elt) {
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vmv.s.x v12, a0
-; CHECK-NEXT: vsetivli zero, 3, e8, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 3, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 2
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT: vand.vi v8, v8, 1
@@ -219,7 +219,7 @@ define <vscale x 64 x i1> @insertelt_nxv64i1(<vscale x 64 x i1> %x, i1 %elt) {
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vmv.s.x v16, a0
-; CHECK-NEXT: vsetivli zero, 3, e8, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 3, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v16, 2
; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT: vand.vi v8, v8, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
index 39f94eab2aa6606..c56f3df66dee9a3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
@@ -151,7 +151,7 @@ define <vscale x 16 x i8> @insertelt_nxv16i8_0(<vscale x 16 x i8> %v, i8 signext
define <vscale x 16 x i8> @insertelt_nxv16i8_imm(<vscale x 16 x i8> %v, i8 signext %elt) {
; CHECK-LABEL: insertelt_nxv16i8_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -185,7 +185,7 @@ define <vscale x 32 x i8> @insertelt_nxv32i8_0(<vscale x 32 x i8> %v, i8 signext
define <vscale x 32 x i8> @insertelt_nxv32i8_imm(<vscale x 32 x i8> %v, i8 signext %elt) {
; CHECK-LABEL: insertelt_nxv32i8_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v12, a0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -219,7 +219,7 @@ define <vscale x 64 x i8> @insertelt_nxv64i8_0(<vscale x 64 x i8> %v, i8 signext
define <vscale x 64 x i8> @insertelt_nxv64i8_imm(<vscale x 64 x i8> %v, i8 signext %elt) {
; CHECK-LABEL: insertelt_nxv64i8_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v16, a0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -355,7 +355,7 @@ define <vscale x 8 x i16> @insertelt_nxv8i16_0(<vscale x 8 x i16> %v, i16 signex
define <vscale x 8 x i16> @insertelt_nxv8i16_imm(<vscale x 8 x i16> %v, i16 signext %elt) {
; CHECK-LABEL: insertelt_nxv8i16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -389,7 +389,7 @@ define <vscale x 16 x i16> @insertelt_nxv16i16_0(<vscale x 16 x i16> %v, i16 sig
define <vscale x 16 x i16> @insertelt_nxv16i16_imm(<vscale x 16 x i16> %v, i16 signext %elt) {
; CHECK-LABEL: insertelt_nxv16i16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v12, a0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -423,7 +423,7 @@ define <vscale x 32 x i16> @insertelt_nxv32i16_0(<vscale x 32 x i16> %v, i16 sig
define <vscale x 32 x i16> @insertelt_nxv32i16_imm(<vscale x 32 x i16> %v, i16 signext %elt) {
; CHECK-LABEL: insertelt_nxv32i16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v16, a0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -525,7 +525,7 @@ define <vscale x 4 x i32> @insertelt_nxv4i32_0(<vscale x 4 x i32> %v, i32 %elt)
define <vscale x 4 x i32> @insertelt_nxv4i32_imm(<vscale x 4 x i32> %v, i32 %elt) {
; CHECK-LABEL: insertelt_nxv4i32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -559,7 +559,7 @@ define <vscale x 8 x i32> @insertelt_nxv8i32_0(<vscale x 8 x i32> %v, i32 %elt)
define <vscale x 8 x i32> @insertelt_nxv8i32_imm(<vscale x 8 x i32> %v, i32 %elt) {
; CHECK-LABEL: insertelt_nxv8i32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v12, a0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -593,7 +593,7 @@ define <vscale x 16 x i32> @insertelt_nxv16i32_0(<vscale x 16 x i32> %v, i32 %el
define <vscale x 16 x i32> @insertelt_nxv16i32_imm(<vscale x 16 x i32> %v, i32 %elt) {
; CHECK-LABEL: insertelt_nxv16i32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v16, a0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -707,7 +707,7 @@ define <vscale x 4 x i64> @insertelt_nxv4i64_imm(<vscale x 4 x i64> %v, i64 %elt
; CHECK-NEXT: vsetivli zero, 2, e32, m4, ta, ma
; CHECK-NEXT: vslide1down.vx v12, v8, a0
; CHECK-NEXT: vslide1down.vx v12, v12, a1
-; CHECK-NEXT: vsetivli zero, 4, e64, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
%r = insertelement <vscale x 4 x i64> %v, i64 %elt, i32 3
@@ -745,7 +745,7 @@ define <vscale x 8 x i64> @insertelt_nxv8i64_imm(<vscale x 8 x i64> %v, i64 %elt
; CHECK-NEXT: vsetivli zero, 2, e32, m8, ta, ma
; CHECK-NEXT: vslide1down.vx v16, v8, a0
; CHECK-NEXT: vslide1down.vx v16, v16, a1
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
%r = insertelement <vscale x 8 x i64> %v, i64 %elt, i32 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
index 1dd00197bbbb044..44a68a5e2ca3133 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
@@ -151,7 +151,7 @@ define <vscale x 16 x i8> @insertelt_nxv16i8_0(<vscale x 16 x i8> %v, i8 signext
define <vscale x 16 x i8> @insertelt_nxv16i8_imm(<vscale x 16 x i8> %v, i8 signext %elt) {
; CHECK-LABEL: insertelt_nxv16i8_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -185,7 +185,7 @@ define <vscale x 32 x i8> @insertelt_nxv32i8_0(<vscale x 32 x i8> %v, i8 signext
define <vscale x 32 x i8> @insertelt_nxv32i8_imm(<vscale x 32 x i8> %v, i8 signext %elt) {
; CHECK-LABEL: insertelt_nxv32i8_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v12, a0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -219,7 +219,7 @@ define <vscale x 64 x i8> @insertelt_nxv64i8_0(<vscale x 64 x i8> %v, i8 signext
define <vscale x 64 x i8> @insertelt_nxv64i8_imm(<vscale x 64 x i8> %v, i8 signext %elt) {
; CHECK-LABEL: insertelt_nxv64i8_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v16, a0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -355,7 +355,7 @@ define <vscale x 8 x i16> @insertelt_nxv8i16_0(<vscale x 8 x i16> %v, i16 signex
define <vscale x 8 x i16> @insertelt_nxv8i16_imm(<vscale x 8 x i16> %v, i16 signext %elt) {
; CHECK-LABEL: insertelt_nxv8i16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -389,7 +389,7 @@ define <vscale x 16 x i16> @insertelt_nxv16i16_0(<vscale x 16 x i16> %v, i16 sig
define <vscale x 16 x i16> @insertelt_nxv16i16_imm(<vscale x 16 x i16> %v, i16 signext %elt) {
; CHECK-LABEL: insertelt_nxv16i16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v12, a0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -423,7 +423,7 @@ define <vscale x 32 x i16> @insertelt_nxv32i16_0(<vscale x 32 x i16> %v, i16 sig
define <vscale x 32 x i16> @insertelt_nxv32i16_imm(<vscale x 32 x i16> %v, i16 signext %elt) {
; CHECK-LABEL: insertelt_nxv32i16_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v16, a0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -525,7 +525,7 @@ define <vscale x 4 x i32> @insertelt_nxv4i32_0(<vscale x 4 x i32> %v, i32 signex
define <vscale x 4 x i32> @insertelt_nxv4i32_imm(<vscale x 4 x i32> %v, i32 signext %elt) {
; CHECK-LABEL: insertelt_nxv4i32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vslideup.vi v8, v10, 3
; CHECK-NEXT: ret
@@ -559,7 +559,7 @@ define <vscale x 8 x i32> @insertelt_nxv8i32_0(<vscale x 8 x i32> %v, i32 signex
define <vscale x 8 x i32> @insertelt_nxv8i32_imm(<vscale x 8 x i32> %v, i32 signext %elt) {
; CHECK-LABEL: insertelt_nxv8i32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v12, a0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -593,7 +593,7 @@ define <vscale x 16 x i32> @insertelt_nxv16i32_0(<vscale x 16 x i32> %v, i32 sig
define <vscale x 16 x i32> @insertelt_nxv16i32_imm(<vscale x 16 x i32> %v, i32 signext %elt) {
; CHECK-LABEL: insertelt_nxv16i32_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v16, a0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret
@@ -699,7 +699,7 @@ define <vscale x 4 x i64> @insertelt_nxv4i64_0(<vscale x 4 x i64> %v, i64 %elt)
define <vscale x 4 x i64> @insertelt_nxv4i64_imm(<vscale x 4 x i64> %v, i64 %elt) {
; CHECK-LABEL: insertelt_nxv4i64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e64, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma
; CHECK-NEXT: vmv.s.x v12, a0
; CHECK-NEXT: vslideup.vi v8, v12, 3
; CHECK-NEXT: ret
@@ -735,7 +735,7 @@ define <vscale x 8 x i64> @insertelt_nxv8i64_0(<vscale x 8 x i64> %v, i64 %elt)
define <vscale x 8 x i64> @insertelt_nxv8i64_imm(<vscale x 8 x i64> %v, i64 %elt) {
; CHECK-LABEL: insertelt_nxv8i64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma
; CHECK-NEXT: vmv.s.x v16, a0
; CHECK-NEXT: vslideup.vi v8, v16, 3
; CHECK-NEXT: ret