[llvm] 3a6cc52 - Revert "[RISCV] Shrink vslideup's LMUL when lowering fixed insert_subvector (#65997)"
Philip Reames via llvm-commits
llvm-commits@lists.llvm.org
Tue Oct 10 15:14:24 PDT 2023
Author: Philip Reames
Date: 2023-10-10T15:13:57-07:00
New Revision: 3a6cc52fe3501b2b7b3aabdff305a18122c9e0db
URL: https://github.com/llvm/llvm-project/commit/3a6cc52fe3501b2b7b3aabdff305a18122c9e0db
DIFF: https://github.com/llvm/llvm-project/commit/3a6cc52fe3501b2b7b3aabdff305a18122c9e0db.diff
LOG: Revert "[RISCV] Shrink vslideup's LMUL when lowering fixed insert_subvector (#65997)"
This reverts commit b5ff71e261b637ab7088fb5c3314bf71d6e01da7. As described in
https://github.com/llvm/llvm-project/issues/68730, this appears to have exposed
an existing liveness issue. Reverting to green until we can figure out how to
address the root cause.
Note: This was not a clean revert. I ended up doing it by hand.
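For context, the reverted change affected how fixed-length subvector inserts into scalable
vectors are lowered on RISC-V. A minimal sketch of the affected pattern, adapted from
insert_nxv8i32_v2i32_2 in the test file updated below; the intrinsic declaration and its
type-mangling suffix are assumed to match the file's existing convention and are
illustrative only:

  declare <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)

  define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
    ; Insert a fixed <2 x i32> into a scalable m4 container at element index 2.
    %sv = load <2 x i32>, ptr %svp
    %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
    ret <vscale x 8 x i32> %v
  }

With the reverted optimization, the vslideup for this insert was issued at a reduced LMUL
(m1 in the old CHECK lines below); after this revert it is issued at the container's full
LMUL (m4), as the updated CHECK lines show.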
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6be3fa71479be5c..e0ca9913f942acd 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8887,17 +8887,6 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
return DAG.getBitcast(Op.getValueType(), SubVec);
}
- // Shrink down Vec so we're performing the slideup on a smaller LMUL.
- unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
- MVT OrigContainerVT = ContainerVT;
- SDValue OrigVec = Vec;
- if (auto ShrunkVT =
- getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
- ContainerVT = *ShrunkVT;
- Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
- DAG.getVectorIdxConstant(0, DL));
- }
-
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
DAG.getUNDEF(ContainerVT), SubVec,
DAG.getConstant(0, DL, XLenVT));
@@ -8924,12 +8913,6 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
SlideupAmt, Mask, VL, Policy);
}
- // If we performed the slideup on a smaller LMUL, insert the result back
- // into the rest of the vector.
- if (ContainerVT != OrigContainerVT)
- SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
- SubVec, DAG.getVectorIdxConstant(0, DL));
-
if (VecVT.isFixedLengthVector())
SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
return DAG.getBitcast(Op.getValueType(), SubVec);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 15669f03e893d74..a77c49c942561b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -14,7 +14,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e32, m4, tu, ma
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
@@ -27,7 +27,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 2
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
@@ -40,7 +40,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 6
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
@@ -51,19 +51,22 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
; LMULMAX2: # %bb.0:
-; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, tu, ma
-; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v12, (a0)
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; LMULMAX2-NEXT: vmv.v.v v8, v12
; LMULMAX2-NEXT: ret
;
; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
; LMULMAX1: # %bb.0:
-; LMULMAX1-NEXT: addi a1, a0, 16
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v12, (a1)
-; LMULMAX1-NEXT: vsetvli zero, zero, e32, m1, tu, ma
-; LMULMAX1-NEXT: vle32.v v8, (a0)
-; LMULMAX1-NEXT: vsetivli zero, 8, e32, m2, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v12, 4
+; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v16, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; LMULMAX1-NEXT: vmv.v.v v8, v12
+; LMULMAX1-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v16, 4
; LMULMAX1-NEXT: ret
%sv = load <8 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -81,14 +84,14 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
;
; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: addi a1, a0, 16
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v12, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v12, (a1)
; LMULMAX1-NEXT: vle32.v v16, (a0)
; LMULMAX1-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v12, 8
+; LMULMAX1-NEXT: vslideup.vi v8, v16, 8
; LMULMAX1-NEXT: vsetivli zero, 16, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v16, 12
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 12
; LMULMAX1-NEXT: ret
%sv = load <8 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -163,7 +166,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
; LMULMAX2-NEXT: vle32.v v8, (a1)
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; LMULMAX2-NEXT: vmv.v.v v10, v8
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -194,7 +197,7 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; LMULMAX2-NEXT: vle32.v v8, (a1)
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; LMULMAX2-NEXT: vslideup.vi v10, v8, 2
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -505,9 +508,9 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, <vscale x 16 x i64>* %o
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vle64.v v12, (a1)
-; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 4
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v16, 4
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: ret
%sv0 = load <2 x i64>, ptr %psv0
@@ -536,7 +539,7 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, <vscale x 16 x i64>* %out) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v16, v8, 2
; CHECK-NEXT: vs8r.v v16, (a1)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index 805557905117add..f52ba6f51d5c897 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -27,13 +27,13 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a2, a0, 8
-; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: vle16.v v10, (a2)
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vle16.v v10, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 4
+; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 8
+; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
%a = load <4 x i16>, ptr %x
@@ -75,17 +75,17 @@ define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 8
-; CHECK-NO-MISALIGN-NEXT: vle8.v v9, (a2)
-; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 16
; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 16
+; CHECK-NO-MISALIGN-NEXT: vle8.v v12, (a2)
; CHECK-NO-MISALIGN-NEXT: addi a0, a0, 24
-; CHECK-NO-MISALIGN-NEXT: vle8.v v12, (a0)
-; CHECK-NO-MISALIGN-NEXT: vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v9, 4
+; CHECK-NO-MISALIGN-NEXT: vle8.v v14, (a0)
+; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 4
; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 8
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v12, 8
; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v12, 12
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v14, 12
; CHECK-NO-MISALIGN-NEXT: vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT: ret
;
@@ -188,17 +188,17 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a2, a0, 2
-; CHECK-NEXT: vle16.v v9, (a2)
-; CHECK-NEXT: addi a2, a0, 6
; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: addi a2, a0, 6
+; CHECK-NEXT: vle16.v v12, (a2)
; CHECK-NEXT: addi a0, a0, 8
-; CHECK-NEXT: vle16.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 4
+; CHECK-NEXT: vle16.v v14, (a0)
+; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 8
+; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 12
+; CHECK-NEXT: vslideup.vi v8, v14, 12
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
%a = load <4 x i16>, ptr %x
@@ -258,17 +258,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT: vle16.v v8, (a0)
; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: vle16.v v9, (a0)
-; RV32-NEXT: add a0, a0, a4
; RV32-NEXT: vle16.v v10, (a0)
-; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a0, a4
; RV32-NEXT: vle16.v v12, (a0)
-; RV32-NEXT: vsetivli zero, 8, e16, m1, tu, ma
-; RV32-NEXT: vslideup.vi v8, v9, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: vle16.v v14, (a0)
+; RV32-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; RV32-NEXT: vslideup.vi v8, v10, 4
; RV32-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; RV32-NEXT: vslideup.vi v8, v10, 8
+; RV32-NEXT: vslideup.vi v8, v12, 8
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vslideup.vi v8, v12, 12
+; RV32-NEXT: vslideup.vi v8, v14, 12
; RV32-NEXT: vse16.v v8, (a1)
; RV32-NEXT: ret
;
@@ -277,17 +277,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT: vle16.v v8, (a0)
; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: vle16.v v9, (a0)
-; RV64-NEXT: add a0, a0, a3
; RV64-NEXT: vle16.v v10, (a0)
-; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, a0, a3
; RV64-NEXT: vle16.v v12, (a0)
-; RV64-NEXT: vsetivli zero, 8, e16, m1, tu, ma
-; RV64-NEXT: vslideup.vi v8, v9, 4
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: vle16.v v14, (a0)
+; RV64-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; RV64-NEXT: vslideup.vi v8, v10, 4
; RV64-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; RV64-NEXT: vslideup.vi v8, v10, 8
+; RV64-NEXT: vslideup.vi v8, v12, 8
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vslideup.vi v8, v12, 12
+; RV64-NEXT: vslideup.vi v8, v14, 12
; RV64-NEXT: vse16.v v8, (a1)
; RV64-NEXT: ret
;
@@ -296,17 +296,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; ZVE64F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT: vle16.v v8, (a0)
; ZVE64F-NEXT: add a0, a0, a2
-; ZVE64F-NEXT: vle16.v v9, (a0)
-; ZVE64F-NEXT: add a0, a0, a3
; ZVE64F-NEXT: vle16.v v10, (a0)
-; ZVE64F-NEXT: add a0, a0, a2
+; ZVE64F-NEXT: add a0, a0, a3
; ZVE64F-NEXT: vle16.v v12, (a0)
-; ZVE64F-NEXT: vsetivli zero, 8, e16, m1, tu, ma
-; ZVE64F-NEXT: vslideup.vi v8, v9, 4
+; ZVE64F-NEXT: add a0, a0, a2
+; ZVE64F-NEXT: vle16.v v14, (a0)
+; ZVE64F-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; ZVE64F-NEXT: vslideup.vi v8, v10, 4
; ZVE64F-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; ZVE64F-NEXT: vslideup.vi v8, v10, 8
+; ZVE64F-NEXT: vslideup.vi v8, v12, 8
; ZVE64F-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; ZVE64F-NEXT: vslideup.vi v8, v12, 12
+; ZVE64F-NEXT: vslideup.vi v8, v14, 12
; ZVE64F-NEXT: vse16.v v8, (a1)
; ZVE64F-NEXT: ret
%a = load <4 x i16>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 6cfa504b501bacb..7497051027fa372 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -460,49 +460,54 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 0(a0)
-; CHECK-V-NEXT: lhu s2, 8(a0)
-; CHECK-V-NEXT: lhu a0, 16(a0)
+; CHECK-V-NEXT: lhu s1, 16(a0)
+; CHECK-V-NEXT: lhu s2, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
-; CHECK-V-NEXT: vmv.s.x v10, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: lui a0, 524288
@@ -627,49 +632,54 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 0(a0)
-; CHECK-V-NEXT: lhu s2, 8(a0)
-; CHECK-V-NEXT: lhu a0, 16(a0)
+; CHECK-V-NEXT: lhu s1, 16(a0)
+; CHECK-V-NEXT: lhu s2, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
-; CHECK-V-NEXT: vmv.s.x v10, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -803,49 +813,54 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 0(a0)
-; CHECK-V-NEXT: lhu s2, 8(a0)
-; CHECK-V-NEXT: lhu a0, 16(a0)
+; CHECK-V-NEXT: lhu s1, 16(a0)
+; CHECK-V-NEXT: lhu s2, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
-; CHECK-V-NEXT: vmv.s.x v10, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -1439,8 +1454,8 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: lhu s3, 32(a0)
; CHECK-V-NEXT: lhu s4, 24(a0)
; CHECK-V-NEXT: lhu s5, 16(a0)
-; CHECK-V-NEXT: lhu s6, 8(a0)
-; CHECK-V-NEXT: lhu a0, 0(a0)
+; CHECK-V-NEXT: lhu s6, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
@@ -1451,16 +1466,16 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
-; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -1469,7 +1484,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -1722,8 +1737,8 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: lhu s3, 32(a0)
; CHECK-V-NEXT: lhu s4, 24(a0)
; CHECK-V-NEXT: lhu s5, 16(a0)
-; CHECK-V-NEXT: lhu s6, 8(a0)
-; CHECK-V-NEXT: lhu a0, 0(a0)
+; CHECK-V-NEXT: lhu s6, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
@@ -1734,16 +1749,16 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
-; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -1752,7 +1767,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -2025,8 +2040,8 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: lhu s3, 32(a0)
; CHECK-V-NEXT: lhu s4, 24(a0)
; CHECK-V-NEXT: lhu s5, 16(a0)
-; CHECK-V-NEXT: lhu s6, 8(a0)
-; CHECK-V-NEXT: lhu a0, 0(a0)
+; CHECK-V-NEXT: lhu s6, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
@@ -2037,16 +2052,16 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
-; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -2055,7 +2070,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -3781,49 +3796,54 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 0(a0)
-; CHECK-V-NEXT: lhu s2, 8(a0)
-; CHECK-V-NEXT: lhu a0, 16(a0)
+; CHECK-V-NEXT: lhu s1, 16(a0)
+; CHECK-V-NEXT: lhu s2, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
-; CHECK-V-NEXT: vmv.s.x v10, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: lui a0, 524288
@@ -3946,49 +3966,54 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 0(a0)
-; CHECK-V-NEXT: lhu s2, 8(a0)
-; CHECK-V-NEXT: lhu a0, 16(a0)
+; CHECK-V-NEXT: lhu s1, 16(a0)
+; CHECK-V-NEXT: lhu s2, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
-; CHECK-V-NEXT: vmv.s.x v10, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -4121,49 +4146,54 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: sub sp, sp, a1
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; CHECK-V-NEXT: lhu s0, 24(a0)
-; CHECK-V-NEXT: lhu s1, 0(a0)
-; CHECK-V-NEXT: lhu s2, 8(a0)
-; CHECK-V-NEXT: lhu a0, 16(a0)
+; CHECK-V-NEXT: lhu s1, 16(a0)
+; CHECK-V-NEXT: lhu s2, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: csrr a0, vlenb
-; CHECK-V-NEXT: slli a0, a0, 1
-; CHECK-V-NEXT: add a0, sp, a0
-; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: fmv.w.x fa0, s1
+; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: fmv.w.x fa0, s1
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, tu, ma
-; CHECK-V-NEXT: vmv.s.x v10, a0
-; CHECK-V-NEXT: addi a0, sp, 16
-; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add a0, sp, a0
; CHECK-V-NEXT: addi a0, a0, 16
-; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 2
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: addi a0, sp, 16
+; CHECK-V-NEXT: csrr a0, vlenb
+; CHECK-V-NEXT: slli a0, a0, 1
+; CHECK-V-NEXT: add a0, sp, a0
+; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
; CHECK-V-NEXT: li a0, -1
@@ -4745,8 +4775,8 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: lhu s3, 32(a0)
; CHECK-V-NEXT: lhu s4, 24(a0)
; CHECK-V-NEXT: lhu s5, 16(a0)
-; CHECK-V-NEXT: lhu s6, 8(a0)
-; CHECK-V-NEXT: lhu a0, 0(a0)
+; CHECK-V-NEXT: lhu s6, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
@@ -4757,16 +4787,16 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
-; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -4775,7 +4805,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5024,8 +5054,8 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: lhu s3, 32(a0)
; CHECK-V-NEXT: lhu s4, 24(a0)
; CHECK-V-NEXT: lhu s5, 16(a0)
-; CHECK-V-NEXT: lhu s6, 8(a0)
-; CHECK-V-NEXT: lhu a0, 0(a0)
+; CHECK-V-NEXT: lhu s6, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
@@ -5036,16 +5066,16 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
-; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5054,7 +5084,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5326,8 +5356,8 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: lhu s3, 32(a0)
; CHECK-V-NEXT: lhu s4, 24(a0)
; CHECK-V-NEXT: lhu s5, 16(a0)
-; CHECK-V-NEXT: lhu s6, 8(a0)
-; CHECK-V-NEXT: lhu a0, 0(a0)
+; CHECK-V-NEXT: lhu s6, 0(a0)
+; CHECK-V-NEXT: lhu a0, 8(a0)
; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
@@ -5338,16 +5368,16 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s6
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslideup.vi v10, v8, 1
-; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT: vslideup.vi v8, v10, 1
+; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: fmv.w.x fa0, s5
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5356,7 +5386,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s4
; CHECK-V-NEXT: call __extendhfsf2 at plt
; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload