[llvm] 545a71c - [RISCV] Pre-promote v1i1/v2i1/v4i1->i1/i2/i4 bitcasts before type legalization
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 18 11:16:19 PDT 2022
Author: Craig Topper
Date: 2022-06-18T11:06:45-07:00
New Revision: 545a71c0d66eb93eb47ad49ec6f91965cca8f137
URL: https://github.com/llvm/llvm-project/commit/545a71c0d66eb93eb47ad49ec6f91965cca8f137
DIFF: https://github.com/llvm/llvm-project/commit/545a71c0d66eb93eb47ad49ec6f91965cca8f137.diff
LOG: [RISCV] Pre-promote v1i1/v2i1/v4i1->i1/i2/i4 bitcasts before type legalization
Type legalization will convert the bitcast into a vector store and a
scalar load.
Instead, this patch widens the vector to v8i1 with undef elements and
bitcasts it to i8. The v8i1->i8 bitcast already has custom handling for
type legalization that bitcasts to a v1i8 vector and uses an
extract_element.
The code here was lifted from X86's AVX-512 support.
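For illustration only (not part of the patch), a hypothetical IR function
that would hit this combine is a <4 x i1> compare result bitcast to the
illegal scalar type i4; with this change the mask is widened to v8i1,
bitcast to i8, and truncated to i4 instead of taking a round trip through
the stack:

; Hypothetical example, not taken from the test suite.
define i4 @bitcast_v4i1_to_i4(<4 x i32> %a, <4 x i32> %b) {
  %m = icmp eq <4 x i32> %a, %b     ; produces a <4 x i1> mask
  %c = bitcast <4 x i1> %m to i4    ; i4 is illegal, so this triggers the combine
  ret i4 %c
}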
Reviewed By: reames
Differential Revision: https://reviews.llvm.org/D128099
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 75f144742c710..8dc10a99e427f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -942,6 +942,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR});
+ if (Subtarget.useRVVForFixedLengthVectors())
+ setTargetDAGCombine(ISD::BITCAST);
setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
@@ -9049,6 +9051,26 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
+ case ISD::BITCAST: {
+ assert(Subtarget.useRVVForFixedLengthVectors());
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+ // If this is a bitcast between a MVT::v4i1/v2i1/v1i1 and an illegal integer
+ // type, widen both sides to avoid a trip through memory.
+ if ((SrcVT == MVT::v1i1 || SrcVT == MVT::v2i1 || SrcVT == MVT::v4i1) &&
+ VT.isScalarInteger()) {
+ unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
+ Ops[0] = N0;
+ SDLoc DL(N);
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i1, Ops);
+ N0 = DAG.getBitcast(MVT::i8, N0);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+ }
+
+ return SDValue();
+ }
}
return SDValue();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 8b73efd7151e9..cb1d2e40bc8ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -75,27 +75,14 @@ define <2 x i8> @mgather_v2i8(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru)
;
; RV64ZVE32F-LABEL: mgather_v2i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB1_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB1_4
; RV64ZVE32F-NEXT: .LBB1_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB1_3: # %cond.load
; RV64ZVE32F-NEXT: lb a0, 0(a0)
@@ -109,7 +96,6 @@ define <2 x i8> @mgather_v2i8(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru)
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
ret <2 x i8> %v
@@ -142,20 +128,8 @@ define <2 x i16> @mgather_v2i8_sextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
;
; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB2_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -175,7 +149,6 @@ define <2 x i16> @mgather_v2i8_sextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vsext.vf2 v9, v8
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
%ev = sext <2 x i8> %v to <2 x i16>
@@ -209,20 +182,8 @@ define <2 x i16> @mgather_v2i8_zextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
;
; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB3_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -242,7 +203,6 @@ define <2 x i16> @mgather_v2i8_zextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vzext.vf2 v9, v8
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
%ev = zext <2 x i8> %v to <2 x i16>
@@ -276,20 +236,8 @@ define <2 x i32> @mgather_v2i8_sextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
;
; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB4_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -309,7 +257,6 @@ define <2 x i32> @mgather_v2i8_sextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vsext.vf4 v9, v8
; RV64ZVE32F-NEXT: vmv.v.v v8, v9
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
%ev = sext <2 x i8> %v to <2 x i32>
@@ -343,20 +290,8 @@ define <2 x i32> @mgather_v2i8_zextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
;
; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB5_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -376,7 +311,6 @@ define <2 x i32> @mgather_v2i8_zextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vzext.vf4 v9, v8
; RV64ZVE32F-NEXT: vmv.v.v v8, v9
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
%ev = zext <2 x i8> %v to <2 x i32>
@@ -418,20 +352,8 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
;
; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB6_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -452,7 +374,6 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
%ev = sext <2 x i8> %v to <2 x i64>
@@ -494,20 +415,8 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
;
; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB7_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -530,7 +439,6 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x
; RV64ZVE32F-NEXT: andi a1, a0, 255
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: andi a0, a0, 255
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
%ev = zext <2 x i8> %v to <2 x i64>
@@ -556,20 +464,8 @@ define <4 x i8> @mgather_v4i8(<4 x i8*> %ptrs, <4 x i1> %m, <4 x i8> %passthru)
;
; RV64ZVE32F-LABEL: mgather_v4i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
-; RV64ZVE32F-NEXT: lbu a1, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a1, v0
; RV64ZVE32F-NEXT: andi a2, a1, 1
; RV64ZVE32F-NEXT: bnez a2, .LBB8_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
@@ -582,7 +478,6 @@ define <4 x i8> @mgather_v4i8(<4 x i8*> %ptrs, <4 x i1> %m, <4 x i8> %passthru)
; RV64ZVE32F-NEXT: andi a1, a1, 8
; RV64ZVE32F-NEXT: bnez a1, .LBB8_8
; RV64ZVE32F-NEXT: .LBB8_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB8_5: # %cond.load
; RV64ZVE32F-NEXT: ld a2, 0(a0)
@@ -616,7 +511,6 @@ define <4 x i8> @mgather_v4i8(<4 x i8*> %ptrs, <4 x i1> %m, <4 x i8> %passthru)
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %m, <4 x i8> %passthru)
ret <4 x i8> %v
@@ -639,67 +533,53 @@ define <4 x i8> @mgather_truemask_v4i8(<4 x i8*> %ptrs, <4 x i8> %passthru) {
;
; RV64ZVE32F-LABEL: mgather_truemask_v4i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB9_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB9_6
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: bnez a2, .LBB9_6
; RV64ZVE32F-NEXT: .LBB9_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB9_7
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB9_7
; RV64ZVE32F-NEXT: .LBB9_3: # %else5
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: bnez a0, .LBB9_8
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: bnez a1, .LBB9_8
; RV64ZVE32F-NEXT: .LBB9_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB9_5: # %cond.load
-; RV64ZVE32F-NEXT: lb a4, 0(a4)
+; RV64ZVE32F-NEXT: ld a2, 0(a0)
+; RV64ZVE32F-NEXT: lb a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a4
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB9_2
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB9_2
; RV64ZVE32F-NEXT: .LBB9_6: # %cond.load1
-; RV64ZVE32F-NEXT: lb a3, 0(a3)
+; RV64ZVE32F-NEXT: ld a2, 8(a0)
+; RV64ZVE32F-NEXT: lb a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.s.x v9, a3
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB9_3
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: beqz a2, .LBB9_3
; RV64ZVE32F-NEXT: .LBB9_7: # %cond.load4
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: lb a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB9_4
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: beqz a1, .LBB9_4
; RV64ZVE32F-NEXT: .LBB9_8: # %cond.load7
-; RV64ZVE32F-NEXT: lb a0, 0(a1)
+; RV64ZVE32F-NEXT: ld a0, 24(a0)
+; RV64ZVE32F-NEXT: lb a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -1045,27 +925,14 @@ define <2 x i16> @mgather_v2i16(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passth
;
; RV64ZVE32F-LABEL: mgather_v2i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB14_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB14_4
; RV64ZVE32F-NEXT: .LBB14_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB14_3: # %cond.load
; RV64ZVE32F-NEXT: lh a0, 0(a0)
@@ -1079,7 +946,6 @@ define <2 x i16> @mgather_v2i16(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passth
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
ret <2 x i16> %v
@@ -1112,20 +978,8 @@ define <2 x i32> @mgather_v2i16_sextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2
;
; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB15_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -1145,7 +999,6 @@ define <2 x i32> @mgather_v2i16_sextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vsext.vf2 v9, v8
; RV64ZVE32F-NEXT: vmv.v.v v8, v9
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
%ev = sext <2 x i16> %v to <2 x i32>
@@ -1179,20 +1032,8 @@ define <2 x i32> @mgather_v2i16_zextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2
;
; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB16_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -1212,7 +1053,6 @@ define <2 x i32> @mgather_v2i16_zextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vzext.vf2 v9, v8
; RV64ZVE32F-NEXT: vmv.v.v v8, v9
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
%ev = zext <2 x i16> %v to <2 x i32>
@@ -1254,20 +1094,8 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2
;
; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB17_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -1288,7 +1116,6 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
%ev = sext <2 x i16> %v to <2 x i64>
@@ -1332,20 +1159,8 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2
;
; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB18_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -1371,7 +1186,6 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: and a1, a2, a1
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
%ev = zext <2 x i16> %v to <2 x i64>
@@ -1397,20 +1211,8 @@ define <4 x i16> @mgather_v4i16(<4 x i16*> %ptrs, <4 x i1> %m, <4 x i16> %passth
;
; RV64ZVE32F-LABEL: mgather_v4i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
-; RV64ZVE32F-NEXT: lbu a1, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a1, v0
; RV64ZVE32F-NEXT: andi a2, a1, 1
; RV64ZVE32F-NEXT: bnez a2, .LBB19_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
@@ -1423,7 +1225,6 @@ define <4 x i16> @mgather_v4i16(<4 x i16*> %ptrs, <4 x i1> %m, <4 x i16> %passth
; RV64ZVE32F-NEXT: andi a1, a1, 8
; RV64ZVE32F-NEXT: bnez a1, .LBB19_8
; RV64ZVE32F-NEXT: .LBB19_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB19_5: # %cond.load
; RV64ZVE32F-NEXT: ld a2, 0(a0)
@@ -1457,7 +1258,6 @@ define <4 x i16> @mgather_v4i16(<4 x i16*> %ptrs, <4 x i1> %m, <4 x i16> %passth
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> %m, <4 x i16> %passthru)
ret <4 x i16> %v
@@ -1480,67 +1280,53 @@ define <4 x i16> @mgather_truemask_v4i16(<4 x i16*> %ptrs, <4 x i16> %passthru)
;
; RV64ZVE32F-LABEL: mgather_truemask_v4i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB20_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB20_6
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: bnez a2, .LBB20_6
; RV64ZVE32F-NEXT: .LBB20_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB20_7
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB20_7
; RV64ZVE32F-NEXT: .LBB20_3: # %else5
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: bnez a0, .LBB20_8
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: bnez a1, .LBB20_8
; RV64ZVE32F-NEXT: .LBB20_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB20_5: # %cond.load
-; RV64ZVE32F-NEXT: lh a4, 0(a4)
+; RV64ZVE32F-NEXT: ld a2, 0(a0)
+; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a4
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB20_2
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB20_2
; RV64ZVE32F-NEXT: .LBB20_6: # %cond.load1
-; RV64ZVE32F-NEXT: lh a3, 0(a3)
+; RV64ZVE32F-NEXT: ld a2, 8(a0)
+; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.s.x v9, a3
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB20_3
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: beqz a2, .LBB20_3
; RV64ZVE32F-NEXT: .LBB20_7: # %cond.load4
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB20_4
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: beqz a1, .LBB20_4
; RV64ZVE32F-NEXT: .LBB20_8: # %cond.load7
-; RV64ZVE32F-NEXT: lh a0, 0(a1)
+; RV64ZVE32F-NEXT: ld a0, 24(a0)
+; RV64ZVE32F-NEXT: lh a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -2357,27 +2143,14 @@ define <2 x i32> @mgather_v2i32(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passth
;
; RV64ZVE32F-LABEL: mgather_v2i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB28_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB28_4
; RV64ZVE32F-NEXT: .LBB28_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB28_3: # %cond.load
; RV64ZVE32F-NEXT: lw a0, 0(a0)
@@ -2391,7 +2164,6 @@ define <2 x i32> @mgather_v2i32(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passth
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru)
ret <2 x i32> %v
@@ -2433,20 +2205,8 @@ define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2
;
; RV64ZVE32F-LABEL: mgather_v2i32_sextload_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB29_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -2467,7 +2227,6 @@ define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru)
%ev = sext <2 x i32> %v to <2 x i64>
@@ -2506,20 +2265,8 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2
;
; RV64ZVE32F-LABEL: mgather_v2i32_zextload_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB30_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -2544,7 +2291,6 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 32
; RV64ZVE32F-NEXT: srli a0, a0, 32
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru)
%ev = zext <2 x i32> %v to <2 x i64>
@@ -2570,20 +2316,8 @@ define <4 x i32> @mgather_v4i32(<4 x i32*> %ptrs, <4 x i1> %m, <4 x i32> %passth
;
; RV64ZVE32F-LABEL: mgather_v4i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
-; RV64ZVE32F-NEXT: lbu a1, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a1, v0
; RV64ZVE32F-NEXT: andi a2, a1, 1
; RV64ZVE32F-NEXT: bnez a2, .LBB31_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
@@ -2596,7 +2330,6 @@ define <4 x i32> @mgather_v4i32(<4 x i32*> %ptrs, <4 x i1> %m, <4 x i32> %passth
; RV64ZVE32F-NEXT: andi a1, a1, 8
; RV64ZVE32F-NEXT: bnez a1, .LBB31_8
; RV64ZVE32F-NEXT: .LBB31_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB31_5: # %cond.load
; RV64ZVE32F-NEXT: ld a2, 0(a0)
@@ -2630,7 +2363,6 @@ define <4 x i32> @mgather_v4i32(<4 x i32*> %ptrs, <4 x i1> %m, <4 x i32> %passth
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %m, <4 x i32> %passthru)
ret <4 x i32> %v
@@ -2652,67 +2384,53 @@ define <4 x i32> @mgather_truemask_v4i32(<4 x i32*> %ptrs, <4 x i32> %passthru)
;
; RV64ZVE32F-LABEL: mgather_truemask_v4i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB32_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB32_6
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: bnez a2, .LBB32_6
; RV64ZVE32F-NEXT: .LBB32_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB32_7
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB32_7
; RV64ZVE32F-NEXT: .LBB32_3: # %else5
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: bnez a0, .LBB32_8
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: bnez a1, .LBB32_8
; RV64ZVE32F-NEXT: .LBB32_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB32_5: # %cond.load
-; RV64ZVE32F-NEXT: lw a4, 0(a4)
+; RV64ZVE32F-NEXT: ld a2, 0(a0)
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a4
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB32_2
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB32_2
; RV64ZVE32F-NEXT: .LBB32_6: # %cond.load1
-; RV64ZVE32F-NEXT: lw a3, 0(a3)
+; RV64ZVE32F-NEXT: ld a2, 8(a0)
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV64ZVE32F-NEXT: vmv.s.x v9, a3
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB32_3
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: beqz a2, .LBB32_3
; RV64ZVE32F-NEXT: .LBB32_7: # %cond.load4
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB32_4
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: beqz a1, .LBB32_4
; RV64ZVE32F-NEXT: .LBB32_8: # %cond.load7
-; RV64ZVE32F-NEXT: lw a0, 0(a1)
+; RV64ZVE32F-NEXT: ld a0, 24(a0)
+; RV64ZVE32F-NEXT: lw a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vmv.s.x v9, a0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -3990,24 +3708,12 @@ define <2 x i64> @mgather_v2i64(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passth
;
; RV32ZVE32F-LABEL: mgather_v2i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a2, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a2)
-; RV32ZVE32F-NEXT: lbu a4, 15(sp)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a4, v0
; RV32ZVE32F-NEXT: andi a2, a4, 1
; RV32ZVE32F-NEXT: beqz a2, .LBB43_3
; RV32ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a3, v8
; RV32ZVE32F-NEXT: lw a2, 4(a3)
; RV32ZVE32F-NEXT: lw a3, 0(a3)
@@ -4033,25 +3739,12 @@ define <2 x i64> @mgather_v2i64(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passth
; RV32ZVE32F-NEXT: sw a2, 4(a0)
; RV32ZVE32F-NEXT: sw a1, 8(a0)
; RV32ZVE32F-NEXT: sw a4, 12(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mgather_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a4, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a4)
-; RV64ZVE32F-NEXT: lbu a4, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a4, v0
; RV64ZVE32F-NEXT: andi a5, a4, 1
; RV64ZVE32F-NEXT: beqz a5, .LBB43_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -4064,7 +3757,6 @@ define <2 x i64> @mgather_v2i64(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passth
; RV64ZVE32F-NEXT: .LBB43_4: # %else2
; RV64ZVE32F-NEXT: mv a0, a2
; RV64ZVE32F-NEXT: mv a1, a3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %ptrs, i32 8, <2 x i1> %m, <2 x i64> %passthru)
ret <2 x i64> %v
@@ -4089,24 +3781,12 @@ define <4 x i64> @mgather_v4i64(<4 x i64*> %ptrs, <4 x i1> %m, <4 x i64> %passth
;
; RV32ZVE32F-LABEL: mgather_v4i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a2, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a2)
-; RV32ZVE32F-NEXT: lbu a6, 15(sp)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a6, v0
; RV32ZVE32F-NEXT: andi a2, a6, 1
; RV32ZVE32F-NEXT: beqz a2, .LBB44_5
; RV32ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a3, v8
; RV32ZVE32F-NEXT: lw a2, 4(a3)
; RV32ZVE32F-NEXT: lw a3, 0(a3)
@@ -4162,25 +3842,12 @@ define <4 x i64> @mgather_v4i64(<4 x i64*> %ptrs, <4 x i1> %m, <4 x i64> %passth
; RV32ZVE32F-NEXT: sw a7, 20(a0)
; RV32ZVE32F-NEXT: sw a1, 24(a0)
; RV32ZVE32F-NEXT: sw a6, 28(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mgather_v4i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a3, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a3)
-; RV64ZVE32F-NEXT: lbu a5, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a5, v0
; RV64ZVE32F-NEXT: andi a3, a5, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB44_5
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
@@ -4221,7 +3888,6 @@ define <4 x i64> @mgather_v4i64(<4 x i64*> %ptrs, <4 x i1> %m, <4 x i64> %passth
; RV64ZVE32F-NEXT: sd a4, 8(a0)
; RV64ZVE32F-NEXT: sd a6, 16(a0)
; RV64ZVE32F-NEXT: sd a1, 24(a0)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %m, <4 x i64> %passthru)
ret <4 x i64> %v
@@ -4243,138 +3909,114 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x i64*> %ptrs, <4 x i64> %passthru)
;
; RV32ZVE32F-LABEL: mgather_truemask_v4i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: lw a2, 28(a1)
-; RV32ZVE32F-NEXT: lw a3, 24(a1)
-; RV32ZVE32F-NEXT: lw a4, 20(a1)
-; RV32ZVE32F-NEXT: lw a5, 16(a1)
-; RV32ZVE32F-NEXT: lw a6, 12(a1)
-; RV32ZVE32F-NEXT: lw t0, 8(a1)
-; RV32ZVE32F-NEXT: lw a7, 4(a1)
-; RV32ZVE32F-NEXT: lw a1, 0(a1)
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmset.m v0
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi t1, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (t1)
-; RV32ZVE32F-NEXT: lb t1, 15(sp)
-; RV32ZVE32F-NEXT: beqz zero, .LBB45_6
-; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi t2, t1, 2
-; RV32ZVE32F-NEXT: bnez t2, .LBB45_7
-; RV32ZVE32F-NEXT: .LBB45_2: # %else2
-; RV32ZVE32F-NEXT: andi t2, t1, 4
-; RV32ZVE32F-NEXT: bnez t2, .LBB45_8
-; RV32ZVE32F-NEXT: .LBB45_3: # %else5
-; RV32ZVE32F-NEXT: andi t1, t1, 8
-; RV32ZVE32F-NEXT: beqz t1, .LBB45_5
-; RV32ZVE32F-NEXT: .LBB45_4: # %cond.load7
-; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3
+; RV32ZVE32F-NEXT: vmset.m v9
+; RV32ZVE32F-NEXT: vmv.x.s a6, v9
+; RV32ZVE32F-NEXT: bnez zero, .LBB45_5
+; RV32ZVE32F-NEXT: # %bb.1: # %cond.load
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a3, v8
; RV32ZVE32F-NEXT: lw a2, 4(a3)
; RV32ZVE32F-NEXT: lw a3, 0(a3)
-; RV32ZVE32F-NEXT: .LBB45_5: # %else8
-; RV32ZVE32F-NEXT: sw a1, 0(a0)
-; RV32ZVE32F-NEXT: sw a7, 4(a0)
-; RV32ZVE32F-NEXT: sw t0, 8(a0)
-; RV32ZVE32F-NEXT: sw a6, 12(a0)
-; RV32ZVE32F-NEXT: sw a5, 16(a0)
-; RV32ZVE32F-NEXT: sw a4, 20(a0)
-; RV32ZVE32F-NEXT: sw a3, 24(a0)
-; RV32ZVE32F-NEXT: sw a2, 28(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
-; RV32ZVE32F-NEXT: ret
-; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
-; RV32ZVE32F-NEXT: vmv.x.s a1, v8
-; RV32ZVE32F-NEXT: lw a7, 4(a1)
-; RV32ZVE32F-NEXT: lw a1, 0(a1)
-; RV32ZVE32F-NEXT: andi t2, t1, 2
-; RV32ZVE32F-NEXT: beqz t2, .LBB45_2
-; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load1
+; RV32ZVE32F-NEXT: andi a4, a6, 2
+; RV32ZVE32F-NEXT: bnez a4, .LBB45_6
+; RV32ZVE32F-NEXT: .LBB45_2:
+; RV32ZVE32F-NEXT: lw a4, 12(a1)
+; RV32ZVE32F-NEXT: lw a5, 8(a1)
+; RV32ZVE32F-NEXT: andi a7, a6, 4
+; RV32ZVE32F-NEXT: bnez a7, .LBB45_7
+; RV32ZVE32F-NEXT: .LBB45_3:
+; RV32ZVE32F-NEXT: lw a7, 20(a1)
+; RV32ZVE32F-NEXT: lw t0, 16(a1)
+; RV32ZVE32F-NEXT: andi a6, a6, 8
+; RV32ZVE32F-NEXT: bnez a6, .LBB45_8
+; RV32ZVE32F-NEXT: .LBB45_4:
+; RV32ZVE32F-NEXT: lw a6, 28(a1)
+; RV32ZVE32F-NEXT: lw a1, 24(a1)
+; RV32ZVE32F-NEXT: j .LBB45_9
+; RV32ZVE32F-NEXT: .LBB45_5:
+; RV32ZVE32F-NEXT: lw a2, 4(a1)
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: andi a4, a6, 2
+; RV32ZVE32F-NEXT: beqz a4, .LBB45_2
+; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s t0, v9
-; RV32ZVE32F-NEXT: lw a6, 4(t0)
-; RV32ZVE32F-NEXT: lw t0, 0(t0)
-; RV32ZVE32F-NEXT: andi t2, t1, 4
-; RV32ZVE32F-NEXT: beqz t2, .LBB45_3
-; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load4
-; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; RV32ZVE32F-NEXT: vmv.x.s a5, v9
; RV32ZVE32F-NEXT: lw a4, 4(a5)
; RV32ZVE32F-NEXT: lw a5, 0(a5)
-; RV32ZVE32F-NEXT: andi t1, t1, 8
-; RV32ZVE32F-NEXT: bnez t1, .LBB45_4
-; RV32ZVE32F-NEXT: j .LBB45_5
+; RV32ZVE32F-NEXT: andi a7, a6, 4
+; RV32ZVE32F-NEXT: beqz a7, .LBB45_3
+; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load4
+; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV32ZVE32F-NEXT: vmv.x.s t0, v9
+; RV32ZVE32F-NEXT: lw a7, 4(t0)
+; RV32ZVE32F-NEXT: lw t0, 0(t0)
+; RV32ZVE32F-NEXT: andi a6, a6, 8
+; RV32ZVE32F-NEXT: beqz a6, .LBB45_4
+; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load7
+; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: lw a6, 4(a1)
+; RV32ZVE32F-NEXT: lw a1, 0(a1)
+; RV32ZVE32F-NEXT: .LBB45_9: # %else8
+; RV32ZVE32F-NEXT: sw a3, 0(a0)
+; RV32ZVE32F-NEXT: sw a2, 4(a0)
+; RV32ZVE32F-NEXT: sw a5, 8(a0)
+; RV32ZVE32F-NEXT: sw a4, 12(a0)
+; RV32ZVE32F-NEXT: sw t0, 16(a0)
+; RV32ZVE32F-NEXT: sw a7, 20(a0)
+; RV32ZVE32F-NEXT: sw a1, 24(a0)
+; RV32ZVE32F-NEXT: sw a6, 28(a0)
+; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mgather_truemask_v4i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: ld a3, 24(a2)
-; RV64ZVE32F-NEXT: ld a4, 16(a2)
-; RV64ZVE32F-NEXT: ld a5, 8(a2)
-; RV64ZVE32F-NEXT: ld a2, 0(a2)
-; RV64ZVE32F-NEXT: ld a6, 24(a1)
-; RV64ZVE32F-NEXT: ld a7, 16(a1)
-; RV64ZVE32F-NEXT: ld t0, 8(a1)
-; RV64ZVE32F-NEXT: ld t1, 0(a1)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a1)
-; RV64ZVE32F-NEXT: lb a1, 15(sp)
-; RV64ZVE32F-NEXT: beqz zero, .LBB45_6
-; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi t1, a1, 2
-; RV64ZVE32F-NEXT: bnez t1, .LBB45_7
-; RV64ZVE32F-NEXT: .LBB45_2: # %else2
-; RV64ZVE32F-NEXT: andi t0, a1, 4
-; RV64ZVE32F-NEXT: bnez t0, .LBB45_8
-; RV64ZVE32F-NEXT: .LBB45_3: # %else5
-; RV64ZVE32F-NEXT: andi a1, a1, 8
-; RV64ZVE32F-NEXT: beqz a1, .LBB45_5
-; RV64ZVE32F-NEXT: .LBB45_4: # %cond.load7
-; RV64ZVE32F-NEXT: ld a3, 0(a6)
-; RV64ZVE32F-NEXT: .LBB45_5: # %else8
-; RV64ZVE32F-NEXT: sd a2, 0(a0)
-; RV64ZVE32F-NEXT: sd a5, 8(a0)
-; RV64ZVE32F-NEXT: sd a4, 16(a0)
-; RV64ZVE32F-NEXT: sd a3, 24(a0)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
-; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB45_6: # %cond.load
-; RV64ZVE32F-NEXT: ld a2, 0(t1)
-; RV64ZVE32F-NEXT: andi t1, a1, 2
-; RV64ZVE32F-NEXT: beqz t1, .LBB45_2
-; RV64ZVE32F-NEXT: .LBB45_7: # %cond.load1
-; RV64ZVE32F-NEXT: ld a5, 0(t0)
-; RV64ZVE32F-NEXT: andi t0, a1, 4
-; RV64ZVE32F-NEXT: beqz t0, .LBB45_3
-; RV64ZVE32F-NEXT: .LBB45_8: # %cond.load4
-; RV64ZVE32F-NEXT: ld a4, 0(a7)
-; RV64ZVE32F-NEXT: andi a1, a1, 8
-; RV64ZVE32F-NEXT: bnez a1, .LBB45_4
-; RV64ZVE32F-NEXT: j .LBB45_5
+; RV64ZVE32F-NEXT: vmset.m v8
+; RV64ZVE32F-NEXT: vmv.x.s a5, v8
+; RV64ZVE32F-NEXT: bnez zero, .LBB45_5
+; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
+; RV64ZVE32F-NEXT: ld a3, 0(a1)
+; RV64ZVE32F-NEXT: ld a3, 0(a3)
+; RV64ZVE32F-NEXT: andi a4, a5, 2
+; RV64ZVE32F-NEXT: bnez a4, .LBB45_6
+; RV64ZVE32F-NEXT: .LBB45_2:
+; RV64ZVE32F-NEXT: ld a4, 8(a2)
+; RV64ZVE32F-NEXT: andi a6, a5, 4
+; RV64ZVE32F-NEXT: bnez a6, .LBB45_7
+; RV64ZVE32F-NEXT: .LBB45_3:
+; RV64ZVE32F-NEXT: ld a6, 16(a2)
+; RV64ZVE32F-NEXT: andi a5, a5, 8
+; RV64ZVE32F-NEXT: bnez a5, .LBB45_8
+; RV64ZVE32F-NEXT: .LBB45_4:
+; RV64ZVE32F-NEXT: ld a1, 24(a2)
+; RV64ZVE32F-NEXT: j .LBB45_9
+; RV64ZVE32F-NEXT: .LBB45_5:
+; RV64ZVE32F-NEXT: ld a3, 0(a2)
+; RV64ZVE32F-NEXT: andi a4, a5, 2
+; RV64ZVE32F-NEXT: beqz a4, .LBB45_2
+; RV64ZVE32F-NEXT: .LBB45_6: # %cond.load1
+; RV64ZVE32F-NEXT: ld a4, 8(a1)
+; RV64ZVE32F-NEXT: ld a4, 0(a4)
+; RV64ZVE32F-NEXT: andi a6, a5, 4
+; RV64ZVE32F-NEXT: beqz a6, .LBB45_3
+; RV64ZVE32F-NEXT: .LBB45_7: # %cond.load4
+; RV64ZVE32F-NEXT: ld a6, 16(a1)
+; RV64ZVE32F-NEXT: ld a6, 0(a6)
+; RV64ZVE32F-NEXT: andi a5, a5, 8
+; RV64ZVE32F-NEXT: beqz a5, .LBB45_4
+; RV64ZVE32F-NEXT: .LBB45_8: # %cond.load7
+; RV64ZVE32F-NEXT: ld a1, 24(a1)
+; RV64ZVE32F-NEXT: ld a1, 0(a1)
+; RV64ZVE32F-NEXT: .LBB45_9: # %else8
+; RV64ZVE32F-NEXT: sd a3, 0(a0)
+; RV64ZVE32F-NEXT: sd a4, 8(a0)
+; RV64ZVE32F-NEXT: sd a6, 16(a0)
+; RV64ZVE32F-NEXT: sd a1, 24(a0)
+; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
%v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %mtrue, <4 x i64> %passthru)
@@ -7605,27 +7247,14 @@ define <2 x half> @mgather_v2f16(<2 x half*> %ptrs, <2 x i1> %m, <2 x half> %pas
;
; RV64ZVE32F-LABEL: mgather_v2f16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB59_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB59_4
; RV64ZVE32F-NEXT: .LBB59_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB59_3: # %cond.load
; RV64ZVE32F-NEXT: flh ft0, 0(a0)
@@ -7639,7 +7268,6 @@ define <2 x half> @mgather_v2f16(<2 x half*> %ptrs, <2 x i1> %m, <2 x half> %pas
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> %ptrs, i32 2, <2 x i1> %m, <2 x half> %passthru)
ret <2 x half> %v
@@ -7664,20 +7292,8 @@ define <4 x half> @mgather_v4f16(<4 x half*> %ptrs, <4 x i1> %m, <4 x half> %pas
;
; RV64ZVE32F-LABEL: mgather_v4f16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
-; RV64ZVE32F-NEXT: lbu a1, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a1, v0
; RV64ZVE32F-NEXT: andi a2, a1, 1
; RV64ZVE32F-NEXT: bnez a2, .LBB60_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
@@ -7690,7 +7306,6 @@ define <4 x half> @mgather_v4f16(<4 x half*> %ptrs, <4 x i1> %m, <4 x half> %pas
; RV64ZVE32F-NEXT: andi a1, a1, 8
; RV64ZVE32F-NEXT: bnez a1, .LBB60_8
; RV64ZVE32F-NEXT: .LBB60_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB60_5: # %cond.load
; RV64ZVE32F-NEXT: ld a2, 0(a0)
@@ -7724,7 +7339,6 @@ define <4 x half> @mgather_v4f16(<4 x half*> %ptrs, <4 x i1> %m, <4 x half> %pas
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> %m, <4 x half> %passthru)
ret <4 x half> %v
@@ -7747,67 +7361,53 @@ define <4 x half> @mgather_truemask_v4f16(<4 x half*> %ptrs, <4 x half> %passthr
;
; RV64ZVE32F-LABEL: mgather_truemask_v4f16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB61_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB61_6
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: bnez a2, .LBB61_6
; RV64ZVE32F-NEXT: .LBB61_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB61_7
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB61_7
; RV64ZVE32F-NEXT: .LBB61_3: # %else5
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: bnez a0, .LBB61_8
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: bnez a1, .LBB61_8
; RV64ZVE32F-NEXT: .LBB61_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB61_5: # %cond.load
-; RV64ZVE32F-NEXT: flh ft0, 0(a4)
+; RV64ZVE32F-NEXT: ld a2, 0(a0)
+; RV64ZVE32F-NEXT: flh ft0, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB61_2
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB61_2
; RV64ZVE32F-NEXT: .LBB61_6: # %cond.load1
-; RV64ZVE32F-NEXT: flh ft0, 0(a3)
+; RV64ZVE32F-NEXT: ld a2, 8(a0)
+; RV64ZVE32F-NEXT: flh ft0, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB61_3
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: beqz a2, .LBB61_3
; RV64ZVE32F-NEXT: .LBB61_7: # %cond.load4
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: flh ft0, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB61_4
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: beqz a1, .LBB61_4
; RV64ZVE32F-NEXT: .LBB61_8: # %cond.load7
-; RV64ZVE32F-NEXT: flh ft0, 0(a1)
+; RV64ZVE32F-NEXT: ld a0, 24(a0)
+; RV64ZVE32F-NEXT: flh ft0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -8624,27 +8224,14 @@ define <2 x float> @mgather_v2f32(<2 x float*> %ptrs, <2 x i1> %m, <2 x float> %
;
; RV64ZVE32F-LABEL: mgather_v2f32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB69_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB69_4
; RV64ZVE32F-NEXT: .LBB69_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB69_3: # %cond.load
; RV64ZVE32F-NEXT: flw ft0, 0(a0)
@@ -8658,7 +8245,6 @@ define <2 x float> @mgather_v2f32(<2 x float*> %ptrs, <2 x i1> %m, <2 x float> %
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %ptrs, i32 4, <2 x i1> %m, <2 x float> %passthru)
ret <2 x float> %v
@@ -8683,20 +8269,8 @@ define <4 x float> @mgather_v4f32(<4 x float*> %ptrs, <4 x i1> %m, <4 x float> %
;
; RV64ZVE32F-LABEL: mgather_v4f32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
-; RV64ZVE32F-NEXT: lbu a1, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a1, v0
; RV64ZVE32F-NEXT: andi a2, a1, 1
; RV64ZVE32F-NEXT: bnez a2, .LBB70_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
@@ -8709,7 +8283,6 @@ define <4 x float> @mgather_v4f32(<4 x float*> %ptrs, <4 x i1> %m, <4 x float> %
; RV64ZVE32F-NEXT: andi a1, a1, 8
; RV64ZVE32F-NEXT: bnez a1, .LBB70_8
; RV64ZVE32F-NEXT: .LBB70_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB70_5: # %cond.load
; RV64ZVE32F-NEXT: ld a2, 0(a0)
@@ -8743,7 +8316,6 @@ define <4 x float> @mgather_v4f32(<4 x float*> %ptrs, <4 x i1> %m, <4 x float> %
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> %m, <4 x float> %passthru)
ret <4 x float> %v
@@ -8765,67 +8337,53 @@ define <4 x float> @mgather_truemask_v4f32(<4 x float*> %ptrs, <4 x float> %pass
;
; RV64ZVE32F-LABEL: mgather_truemask_v4f32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a1, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB71_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB71_6
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: bnez a2, .LBB71_6
; RV64ZVE32F-NEXT: .LBB71_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB71_7
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB71_7
; RV64ZVE32F-NEXT: .LBB71_3: # %else5
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: bnez a0, .LBB71_8
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: bnez a1, .LBB71_8
; RV64ZVE32F-NEXT: .LBB71_4: # %else8
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB71_5: # %cond.load
-; RV64ZVE32F-NEXT: flw ft0, 0(a4)
+; RV64ZVE32F-NEXT: ld a2, 0(a0)
+; RV64ZVE32F-NEXT: flw ft0, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB71_2
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB71_2
; RV64ZVE32F-NEXT: .LBB71_6: # %cond.load1
-; RV64ZVE32F-NEXT: flw ft0, 0(a3)
+; RV64ZVE32F-NEXT: ld a2, 8(a0)
+; RV64ZVE32F-NEXT: flw ft0, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB71_3
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: beqz a2, .LBB71_3
; RV64ZVE32F-NEXT: .LBB71_7: # %cond.load4
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: flw ft0, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2
-; RV64ZVE32F-NEXT: andi a0, a0, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB71_4
+; RV64ZVE32F-NEXT: andi a1, a1, 8
+; RV64ZVE32F-NEXT: beqz a1, .LBB71_4
; RV64ZVE32F-NEXT: .LBB71_8: # %cond.load7
-; RV64ZVE32F-NEXT: flw ft0, 0(a1)
+; RV64ZVE32F-NEXT: ld a0, 24(a0)
+; RV64ZVE32F-NEXT: flw ft0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -10101,30 +9659,17 @@ define <2 x double> @mgather_v2f64(<2 x double*> %ptrs, <2 x i1> %m, <2 x double
;
; RV32ZVE32F-LABEL: mgather_v2f64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a0, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a0)
-; RV32ZVE32F-NEXT: lbu a0, 15(sp)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a0, v0
; RV32ZVE32F-NEXT: andi a1, a0, 1
; RV32ZVE32F-NEXT: bnez a1, .LBB82_3
; RV32ZVE32F-NEXT: # %bb.1: # %else
; RV32ZVE32F-NEXT: andi a0, a0, 2
; RV32ZVE32F-NEXT: bnez a0, .LBB82_4
; RV32ZVE32F-NEXT: .LBB82_2: # %else2
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB82_3: # %cond.load
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
; RV32ZVE32F-NEXT: fld fa0, 0(a1)
; RV32ZVE32F-NEXT: andi a0, a0, 2
@@ -10134,32 +9679,18 @@ define <2 x double> @mgather_v2f64(<2 x double*> %ptrs, <2 x i1> %m, <2 x double
; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: fld fa1, 0(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mgather_v2f64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB82_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB82_4
; RV64ZVE32F-NEXT: .LBB82_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB82_3: # %cond.load
; RV64ZVE32F-NEXT: fld fa0, 0(a0)
@@ -10167,7 +9698,6 @@ define <2 x double> @mgather_v2f64(<2 x double*> %ptrs, <2 x i1> %m, <2 x double
; RV64ZVE32F-NEXT: beqz a0, .LBB82_2
; RV64ZVE32F-NEXT: .LBB82_4: # %cond.load1
; RV64ZVE32F-NEXT: fld fa1, 0(a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%v = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 8, <2 x i1> %m, <2 x double> %passthru)
ret <2 x double> %v
@@ -10192,20 +9722,8 @@ define <4 x double> @mgather_v4f64(<4 x double*> %ptrs, <4 x i1> %m, <4 x double
;
; RV32ZVE32F-LABEL: mgather_v4f64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a1, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a1)
-; RV32ZVE32F-NEXT: lbu a1, 15(sp)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a1, v0
; RV32ZVE32F-NEXT: andi a2, a1, 1
; RV32ZVE32F-NEXT: bnez a2, .LBB83_6
; RV32ZVE32F-NEXT: # %bb.1: # %else
@@ -10227,10 +9745,9 @@ define <4 x double> @mgather_v4f64(<4 x double*> %ptrs, <4 x i1> %m, <4 x double
; RV32ZVE32F-NEXT: fsd fa1, 8(a0)
; RV32ZVE32F-NEXT: fsd fa2, 16(a0)
; RV32ZVE32F-NEXT: fsd fa3, 24(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB83_6: # %cond.load
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a2, v8
; RV32ZVE32F-NEXT: fld fa0, 0(a2)
; RV32ZVE32F-NEXT: andi a2, a1, 2
@@ -10253,20 +9770,8 @@ define <4 x double> @mgather_v4f64(<4 x double*> %ptrs, <4 x i1> %m, <4 x double
;
; RV64ZVE32F-LABEL: mgather_v4f64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB83_6
; RV64ZVE32F-NEXT: # %bb.1: # %else
@@ -10286,7 +9791,6 @@ define <4 x double> @mgather_v4f64(<4 x double*> %ptrs, <4 x i1> %m, <4 x double
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
; RV64ZVE32F-NEXT: fsd fa3, 24(a0)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB83_6: # %cond.load
; RV64ZVE32F-NEXT: ld a3, 0(a1)
@@ -10324,21 +9828,9 @@ define <4 x double> @mgather_truemask_v4f64(<4 x double*> %ptrs, <4 x double> %p
;
; RV32ZVE32F-LABEL: mgather_truemask_v4f64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmset.m v0
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a1, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a1)
-; RV32ZVE32F-NEXT: lb a1, 15(sp)
+; RV32ZVE32F-NEXT: vmset.m v9
+; RV32ZVE32F-NEXT: vmv.x.s a1, v9
; RV32ZVE32F-NEXT: beqz zero, .LBB84_6
; RV32ZVE32F-NEXT: # %bb.1: # %else
; RV32ZVE32F-NEXT: andi a2, a1, 2
@@ -10359,7 +9851,6 @@ define <4 x double> @mgather_truemask_v4f64(<4 x double*> %ptrs, <4 x double> %p
; RV32ZVE32F-NEXT: fsd fa1, 8(a0)
; RV32ZVE32F-NEXT: fsd fa2, 16(a0)
; RV32ZVE32F-NEXT: fsd fa3, 24(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB84_6: # %cond.load
; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
@@ -10385,56 +9876,43 @@ define <4 x double> @mgather_truemask_v4f64(<4 x double*> %ptrs, <4 x double> %p
;
; RV64ZVE32F-LABEL: mgather_truemask_v4f64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: ld a2, 24(a1)
-; RV64ZVE32F-NEXT: ld a3, 16(a1)
-; RV64ZVE32F-NEXT: ld a4, 8(a1)
-; RV64ZVE32F-NEXT: ld a5, 0(a1)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a1)
-; RV64ZVE32F-NEXT: lb a1, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v8
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: beqz zero, .LBB84_6
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a5, a1, 2
-; RV64ZVE32F-NEXT: bnez a5, .LBB84_7
+; RV64ZVE32F-NEXT: andi a3, a2, 2
+; RV64ZVE32F-NEXT: bnez a3, .LBB84_7
; RV64ZVE32F-NEXT: .LBB84_2: # %else2
-; RV64ZVE32F-NEXT: andi a4, a1, 4
-; RV64ZVE32F-NEXT: bnez a4, .LBB84_8
+; RV64ZVE32F-NEXT: andi a3, a2, 4
+; RV64ZVE32F-NEXT: bnez a3, .LBB84_8
; RV64ZVE32F-NEXT: .LBB84_3: # %else5
-; RV64ZVE32F-NEXT: andi a1, a1, 8
-; RV64ZVE32F-NEXT: beqz a1, .LBB84_5
+; RV64ZVE32F-NEXT: andi a2, a2, 8
+; RV64ZVE32F-NEXT: beqz a2, .LBB84_5
; RV64ZVE32F-NEXT: .LBB84_4: # %cond.load7
-; RV64ZVE32F-NEXT: fld fa3, 0(a2)
+; RV64ZVE32F-NEXT: ld a1, 24(a1)
+; RV64ZVE32F-NEXT: fld fa3, 0(a1)
; RV64ZVE32F-NEXT: .LBB84_5: # %else8
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
; RV64ZVE32F-NEXT: fsd fa3, 24(a0)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB84_6: # %cond.load
-; RV64ZVE32F-NEXT: fld fa0, 0(a5)
-; RV64ZVE32F-NEXT: andi a5, a1, 2
-; RV64ZVE32F-NEXT: beqz a5, .LBB84_2
+; RV64ZVE32F-NEXT: ld a3, 0(a1)
+; RV64ZVE32F-NEXT: fld fa0, 0(a3)
+; RV64ZVE32F-NEXT: andi a3, a2, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB84_2
; RV64ZVE32F-NEXT: .LBB84_7: # %cond.load1
-; RV64ZVE32F-NEXT: fld fa1, 0(a4)
-; RV64ZVE32F-NEXT: andi a4, a1, 4
-; RV64ZVE32F-NEXT: beqz a4, .LBB84_3
+; RV64ZVE32F-NEXT: ld a3, 8(a1)
+; RV64ZVE32F-NEXT: fld fa1, 0(a3)
+; RV64ZVE32F-NEXT: andi a3, a2, 4
+; RV64ZVE32F-NEXT: beqz a3, .LBB84_3
; RV64ZVE32F-NEXT: .LBB84_8: # %cond.load4
+; RV64ZVE32F-NEXT: ld a3, 16(a1)
; RV64ZVE32F-NEXT: fld fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a1, a1, 8
-; RV64ZVE32F-NEXT: bnez a1, .LBB84_4
+; RV64ZVE32F-NEXT: andi a2, a2, 8
+; RV64ZVE32F-NEXT: bnez a2, .LBB84_4
; RV64ZVE32F-NEXT: j .LBB84_5
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 9ae089d3cefc2..935dccf07f7ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -69,27 +69,14 @@ define void @mscatter_v2i8(<2 x i8> %val, <2 x i8*> %ptrs, <2 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v2i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB1_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB1_4
; RV64ZVE32F-NEXT: .LBB1_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB1_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
@@ -100,7 +87,6 @@ define void @mscatter_v2i8(<2 x i8> %val, <2 x i8*> %ptrs, <2 x i1> %m) {
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse8.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %val, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
ret void
@@ -130,21 +116,8 @@ define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x i8*> %ptrs, <2
;
; RV64ZVE32F-LABEL: mscatter_v2i16_truncstore_v2i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8
; RV64ZVE32F-NEXT: bnez a3, .LBB2_3
@@ -152,7 +125,6 @@ define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x i8*> %ptrs, <2
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB2_4
; RV64ZVE32F-NEXT: .LBB2_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB2_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
@@ -163,7 +135,6 @@ define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x i8*> %ptrs, <2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse8.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%tval = trunc <2 x i16> %val to <2 x i8>
call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
@@ -200,23 +171,10 @@ define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x i8*> %ptrs, <2
;
; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8
; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8
; RV64ZVE32F-NEXT: bnez a3, .LBB3_3
@@ -224,7 +182,6 @@ define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x i8*> %ptrs, <2
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB3_4
; RV64ZVE32F-NEXT: .LBB3_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB3_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
@@ -235,7 +192,6 @@ define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x i8*> %ptrs, <2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse8.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%tval = trunc <2 x i32> %val to <2 x i8>
call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
@@ -288,31 +244,20 @@ define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x i8*> %ptrs, <2
; RV64ZVE32F: # %bb.0:
; RV64ZVE32F-NEXT: addi sp, sp, -16
; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a4, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a4)
-; RV64ZVE32F-NEXT: lbu a4, 15(sp)
-; RV64ZVE32F-NEXT: sb a1, 14(sp)
-; RV64ZVE32F-NEXT: sb a0, 13(sp)
+; RV64ZVE32F-NEXT: sb a1, 15(sp)
+; RV64ZVE32F-NEXT: sb a0, 14(sp)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: addi a0, sp, 14
+; RV64ZVE32F-NEXT: addi a0, sp, 15
; RV64ZVE32F-NEXT: vle8.v v9, (a0)
-; RV64ZVE32F-NEXT: addi a0, sp, 13
+; RV64ZVE32F-NEXT: addi a0, sp, 14
; RV64ZVE32F-NEXT: vle8.v v8, (a0)
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu
-; RV64ZVE32F-NEXT: andi a0, a4, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v0
+; RV64ZVE32F-NEXT: andi a1, a0, 1
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: bnez a0, .LBB4_3
+; RV64ZVE32F-NEXT: bnez a1, .LBB4_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a4, 2
+; RV64ZVE32F-NEXT: andi a0, a0, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB4_4
; RV64ZVE32F-NEXT: .LBB4_2: # %else2
; RV64ZVE32F-NEXT: addi sp, sp, 16
@@ -320,7 +265,7 @@ define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x i8*> %ptrs, <2
; RV64ZVE32F-NEXT: .LBB4_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vse8.v v8, (a2)
-; RV64ZVE32F-NEXT: andi a0, a4, 2
+; RV64ZVE32F-NEXT: andi a0, a0, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB4_2
; RV64ZVE32F-NEXT: .LBB4_4: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
@@ -350,60 +295,46 @@ define void @mscatter_v4i8(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v4i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
-; RV64ZVE32F-NEXT: ld a3, 16(a0)
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: ld a4, 8(a0)
-; RV64ZVE32F-NEXT: andi a5, a2, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a3, v0
+; RV64ZVE32F-NEXT: andi a5, a3, 1
; RV64ZVE32F-NEXT: bnez a5, .LBB5_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB5_6
; RV64ZVE32F-NEXT: .LBB5_2: # %else2
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB5_7
; RV64ZVE32F-NEXT: .LBB5_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB5_8
; RV64ZVE32F-NEXT: .LBB5_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB5_5: # %cond.store
; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vse8.v v8, (a0)
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB5_2
; RV64ZVE32F-NEXT: .LBB5_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vse8.v v9, (a4)
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: beqz a0, .LBB5_3
; RV64ZVE32F-NEXT: .LBB5_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT: vse8.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: vse8.v v9, (a2)
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB5_4
; RV64ZVE32F-NEXT: .LBB5_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse8.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %m)
ret void
@@ -424,60 +355,46 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x i8*> %ptrs) {
;
; RV64ZVE32F-LABEL: mscatter_truemask_v4i8:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV64ZVE32F-NEXT: ld a1, 24(a0)
; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
+; RV64ZVE32F-NEXT: ld a4, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB6_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB6_6
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: bnez a0, .LBB6_6
; RV64ZVE32F-NEXT: .LBB6_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB6_7
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: bnez a0, .LBB6_7
; RV64ZVE32F-NEXT: .LBB6_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB6_8
; RV64ZVE32F-NEXT: .LBB6_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB6_5: # %cond.store
+; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vse8.v v8, (a4)
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB6_2
+; RV64ZVE32F-NEXT: vse8.v v8, (a0)
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB6_2
; RV64ZVE32F-NEXT: .LBB6_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV64ZVE32F-NEXT: vse8.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB6_3
+; RV64ZVE32F-NEXT: vse8.v v9, (a4)
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: beqz a0, .LBB6_3
; RV64ZVE32F-NEXT: .LBB6_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; RV64ZVE32F-NEXT: vse8.v v9, (a2)
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB6_4
; RV64ZVE32F-NEXT: .LBB6_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse8.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -777,27 +694,14 @@ define void @mscatter_v2i16(<2 x i16> %val, <2 x i16*> %ptrs, <2 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v2i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB11_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB11_4
; RV64ZVE32F-NEXT: .LBB11_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB11_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
@@ -808,7 +712,6 @@ define void @mscatter_v2i16(<2 x i16> %val, <2 x i16*> %ptrs, <2 x i1> %m) {
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse16.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %val, <2 x i16*> %ptrs, i32 2, <2 x i1> %m)
ret void
@@ -838,29 +741,16 @@ define void @mscatter_v2i32_truncstore_v2i16(<2 x i32> %val, <2 x i16*> %ptrs, <
;
; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu
-; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
+; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB12_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB12_4
; RV64ZVE32F-NEXT: .LBB12_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB12_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
@@ -871,7 +761,6 @@ define void @mscatter_v2i32_truncstore_v2i16(<2 x i32> %val, <2 x i16*> %ptrs, <
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse16.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%tval = trunc <2 x i32> %val to <2 x i16>
call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %tval, <2 x i16*> %ptrs, i32 2, <2 x i1> %m)
@@ -920,31 +809,21 @@ define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x i16*> %ptrs, <
; RV64ZVE32F: # %bb.0:
; RV64ZVE32F-NEXT: addi sp, sp, -16
; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a4, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a4)
-; RV64ZVE32F-NEXT: lbu a4, 15(sp)
-; RV64ZVE32F-NEXT: sh a1, 12(sp)
-; RV64ZVE32F-NEXT: sh a0, 10(sp)
+; RV64ZVE32F-NEXT: sh a1, 14(sp)
+; RV64ZVE32F-NEXT: sh a0, 12(sp)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
-; RV64ZVE32F-NEXT: addi a0, sp, 12
+; RV64ZVE32F-NEXT: addi a0, sp, 14
; RV64ZVE32F-NEXT: vle16.v v9, (a0)
-; RV64ZVE32F-NEXT: addi a0, sp, 10
+; RV64ZVE32F-NEXT: addi a0, sp, 12
; RV64ZVE32F-NEXT: vle16.v v8, (a0)
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu
-; RV64ZVE32F-NEXT: andi a0, a4, 1
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: bnez a0, .LBB13_3
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a0, v0
+; RV64ZVE32F-NEXT: andi a1, a0, 1
+; RV64ZVE32F-NEXT: bnez a1, .LBB13_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a4, 2
+; RV64ZVE32F-NEXT: andi a0, a0, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB13_4
; RV64ZVE32F-NEXT: .LBB13_2: # %else2
; RV64ZVE32F-NEXT: addi sp, sp, 16
@@ -952,7 +831,7 @@ define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x i16*> %ptrs, <
; RV64ZVE32F-NEXT: .LBB13_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vse16.v v8, (a2)
-; RV64ZVE32F-NEXT: andi a0, a4, 2
+; RV64ZVE32F-NEXT: andi a0, a0, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB13_2
; RV64ZVE32F-NEXT: .LBB13_4: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
@@ -982,60 +861,46 @@ define void @mscatter_v4i16(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v4i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
-; RV64ZVE32F-NEXT: ld a3, 16(a0)
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: ld a4, 8(a0)
-; RV64ZVE32F-NEXT: andi a5, a2, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a3, v0
+; RV64ZVE32F-NEXT: andi a5, a3, 1
; RV64ZVE32F-NEXT: bnez a5, .LBB14_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB14_6
; RV64ZVE32F-NEXT: .LBB14_2: # %else2
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB14_7
; RV64ZVE32F-NEXT: .LBB14_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB14_8
; RV64ZVE32F-NEXT: .LBB14_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB14_5: # %cond.store
; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vse16.v v8, (a0)
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB14_2
; RV64ZVE32F-NEXT: .LBB14_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vse16.v v9, (a4)
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: beqz a0, .LBB14_3
; RV64ZVE32F-NEXT: .LBB14_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB14_4
; RV64ZVE32F-NEXT: .LBB14_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse16.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> %m)
ret void
@@ -1056,60 +921,46 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x i16*> %ptrs) {
;
; RV64ZVE32F-LABEL: mscatter_truemask_v4i16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV64ZVE32F-NEXT: ld a1, 24(a0)
; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
+; RV64ZVE32F-NEXT: ld a4, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB15_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB15_6
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: bnez a0, .LBB15_6
; RV64ZVE32F-NEXT: .LBB15_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB15_7
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: bnez a0, .LBB15_7
; RV64ZVE32F-NEXT: .LBB15_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB15_8
; RV64ZVE32F-NEXT: .LBB15_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB15_5: # %cond.store
+; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
-; RV64ZVE32F-NEXT: vse16.v v8, (a4)
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB15_2
+; RV64ZVE32F-NEXT: vse16.v v8, (a0)
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB15_2
; RV64ZVE32F-NEXT: .LBB15_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV64ZVE32F-NEXT: vse16.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB15_3
+; RV64ZVE32F-NEXT: vse16.v v9, (a4)
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: beqz a0, .LBB15_3
; RV64ZVE32F-NEXT: .LBB15_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB15_4
; RV64ZVE32F-NEXT: .LBB15_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse16.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -1823,27 +1674,14 @@ define void @mscatter_v2i32(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v2i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB23_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB23_4
; RV64ZVE32F-NEXT: .LBB23_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB23_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
@@ -1854,7 +1692,6 @@ define void @mscatter_v2i32(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m) {
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse32.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 4, <2 x i1> %m)
ret void
@@ -1888,47 +1725,37 @@ define void @mscatter_v2i64_truncstore_v2i32(<2 x i64> %val, <2 x i32*> %ptrs, <
;
; RV64ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -32
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 32
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a4, sp, 31
-; RV64ZVE32F-NEXT: vsm.v v8, (a4)
-; RV64ZVE32F-NEXT: lbu a4, 31(sp)
-; RV64ZVE32F-NEXT: sw a1, 24(sp)
-; RV64ZVE32F-NEXT: sw a0, 20(sp)
+; RV64ZVE32F-NEXT: addi sp, sp, -16
+; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
+; RV64ZVE32F-NEXT: sw a1, 12(sp)
+; RV64ZVE32F-NEXT: sw a0, 8(sp)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; RV64ZVE32F-NEXT: addi a0, sp, 24
+; RV64ZVE32F-NEXT: addi a0, sp, 12
; RV64ZVE32F-NEXT: vle32.v v9, (a0)
-; RV64ZVE32F-NEXT: addi a0, sp, 20
+; RV64ZVE32F-NEXT: addi a0, sp, 8
; RV64ZVE32F-NEXT: vle32.v v8, (a0)
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu
-; RV64ZVE32F-NEXT: andi a0, a4, 1
; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT: bnez a0, .LBB24_3
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a0, v0
+; RV64ZVE32F-NEXT: andi a1, a0, 1
+; RV64ZVE32F-NEXT: bnez a1, .LBB24_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a4, 2
+; RV64ZVE32F-NEXT: andi a0, a0, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB24_4
; RV64ZVE32F-NEXT: .LBB24_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 32
+; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB24_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vse32.v v8, (a2)
-; RV64ZVE32F-NEXT: andi a0, a4, 2
+; RV64ZVE32F-NEXT: andi a0, a0, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB24_2
; RV64ZVE32F-NEXT: .LBB24_4: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse32.v v8, (a3)
-; RV64ZVE32F-NEXT: addi sp, sp, 32
+; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%tval = trunc <2 x i64> %val to <2 x i32>
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %tval, <2 x i32*> %ptrs, i32 4, <2 x i1> %m)
@@ -1952,60 +1779,46 @@ define void @mscatter_v4i32(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v4i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
-; RV64ZVE32F-NEXT: ld a3, 16(a0)
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: ld a4, 8(a0)
-; RV64ZVE32F-NEXT: andi a5, a2, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a3, v0
+; RV64ZVE32F-NEXT: andi a5, a3, 1
; RV64ZVE32F-NEXT: bnez a5, .LBB25_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB25_6
; RV64ZVE32F-NEXT: .LBB25_2: # %else2
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB25_7
; RV64ZVE32F-NEXT: .LBB25_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB25_8
; RV64ZVE32F-NEXT: .LBB25_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB25_5: # %cond.store
; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vse32.v v8, (a0)
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB25_2
; RV64ZVE32F-NEXT: .LBB25_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vse32.v v9, (a4)
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: beqz a0, .LBB25_3
; RV64ZVE32F-NEXT: .LBB25_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: vse32.v v9, (a2)
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB25_4
; RV64ZVE32F-NEXT: .LBB25_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse32.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> %m)
ret void
@@ -2026,60 +1839,46 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x i32*> %ptrs) {
;
; RV64ZVE32F-LABEL: mscatter_truemask_v4i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV64ZVE32F-NEXT: ld a1, 24(a0)
; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
+; RV64ZVE32F-NEXT: ld a4, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB26_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB26_6
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: bnez a0, .LBB26_6
; RV64ZVE32F-NEXT: .LBB26_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB26_7
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: bnez a0, .LBB26_7
; RV64ZVE32F-NEXT: .LBB26_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB26_8
; RV64ZVE32F-NEXT: .LBB26_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB26_5: # %cond.store
+; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; RV64ZVE32F-NEXT: vse32.v v8, (a4)
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB26_2
+; RV64ZVE32F-NEXT: vse32.v v8, (a0)
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB26_2
; RV64ZVE32F-NEXT: .LBB26_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV64ZVE32F-NEXT: vse32.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB26_3
+; RV64ZVE32F-NEXT: vse32.v v9, (a4)
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: beqz a0, .LBB26_3
; RV64ZVE32F-NEXT: .LBB26_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; RV64ZVE32F-NEXT: vse32.v v9, (a2)
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB26_4
; RV64ZVE32F-NEXT: .LBB26_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse32.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -3198,34 +2997,21 @@ define void @mscatter_v2i64(<2 x i64> %val, <2 x i64*> %ptrs, <2 x i1> %m) {
;
; RV32ZVE32F-LABEL: mscatter_v2i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a1, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a1)
-; RV32ZVE32F-NEXT: lbu a3, 15(sp)
; RV32ZVE32F-NEXT: lw a2, 12(a0)
; RV32ZVE32F-NEXT: lw a1, 8(a0)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a3, v0
; RV32ZVE32F-NEXT: andi a4, a3, 1
; RV32ZVE32F-NEXT: bnez a4, .LBB37_3
; RV32ZVE32F-NEXT: # %bb.1: # %else
; RV32ZVE32F-NEXT: andi a0, a3, 2
; RV32ZVE32F-NEXT: bnez a0, .LBB37_4
; RV32ZVE32F-NEXT: .LBB37_2: # %else2
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB37_3: # %cond.store
; RV32ZVE32F-NEXT: lw a4, 4(a0)
; RV32ZVE32F-NEXT: lw a0, 0(a0)
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a5, v8
; RV32ZVE32F-NEXT: sw a4, 4(a5)
; RV32ZVE32F-NEXT: sw a0, 0(a5)
@@ -3237,32 +3023,18 @@ define void @mscatter_v2i64(<2 x i64> %val, <2 x i64*> %ptrs, <2 x i1> %m) {
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a2, 4(a0)
; RV32ZVE32F-NEXT: sw a1, 0(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_v2i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a4, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a4)
-; RV64ZVE32F-NEXT: lbu a4, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a4, v0
; RV64ZVE32F-NEXT: andi a5, a4, 1
; RV64ZVE32F-NEXT: bnez a5, .LBB37_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a4, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB37_4
; RV64ZVE32F-NEXT: .LBB37_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB37_3: # %cond.store
; RV64ZVE32F-NEXT: sd a0, 0(a2)
@@ -3270,7 +3042,6 @@ define void @mscatter_v2i64(<2 x i64> %val, <2 x i64*> %ptrs, <2 x i1> %m) {
; RV64ZVE32F-NEXT: beqz a0, .LBB37_2
; RV64ZVE32F-NEXT: .LBB37_4: # %cond.store1
; RV64ZVE32F-NEXT: sd a1, 0(a3)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %val, <2 x i64*> %ptrs, i32 8, <2 x i1> %m)
ret void
@@ -3293,26 +3064,14 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x i64*> %ptrs, <4 x i1> %m) {
;
; RV32ZVE32F-LABEL: mscatter_v4i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a1, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a1)
; RV32ZVE32F-NEXT: lw a1, 28(a0)
; RV32ZVE32F-NEXT: lw a2, 24(a0)
; RV32ZVE32F-NEXT: lw a3, 20(a0)
; RV32ZVE32F-NEXT: lw a4, 16(a0)
-; RV32ZVE32F-NEXT: lbu a5, 15(sp)
; RV32ZVE32F-NEXT: lw a7, 12(a0)
; RV32ZVE32F-NEXT: lw a6, 8(a0)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a5, v0
; RV32ZVE32F-NEXT: andi t0, a5, 1
; RV32ZVE32F-NEXT: bnez t0, .LBB38_5
; RV32ZVE32F-NEXT: # %bb.1: # %else
@@ -3325,12 +3084,11 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x i64*> %ptrs, <4 x i1> %m) {
; RV32ZVE32F-NEXT: andi a0, a5, 8
; RV32ZVE32F-NEXT: bnez a0, .LBB38_8
; RV32ZVE32F-NEXT: .LBB38_4: # %else6
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB38_5: # %cond.store
; RV32ZVE32F-NEXT: lw t0, 4(a0)
; RV32ZVE32F-NEXT: lw a0, 0(a0)
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s t1, v8
; RV32ZVE32F-NEXT: sw t0, 4(t1)
; RV32ZVE32F-NEXT: sw a0, 0(t1)
@@ -3358,62 +3116,47 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x i64*> %ptrs, <4 x i1> %m) {
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a2, 0(a0)
; RV32ZVE32F-NEXT: sw a1, 4(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
-; RV32ZVE32F-NEXT: ret
-;
-; RV64ZVE32F-LABEL: mscatter_v4i64:
-; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a2)
+; RV32ZVE32F-NEXT: ret
+;
+; RV64ZVE32F-LABEL: mscatter_v4i64:
+; RV64ZVE32F: # %bb.0:
; RV64ZVE32F-NEXT: ld a2, 24(a1)
; RV64ZVE32F-NEXT: ld a4, 16(a1)
; RV64ZVE32F-NEXT: ld a7, 8(a1)
; RV64ZVE32F-NEXT: ld a3, 24(a0)
-; RV64ZVE32F-NEXT: lbu a5, 15(sp)
-; RV64ZVE32F-NEXT: ld a6, 16(a0)
+; RV64ZVE32F-NEXT: ld a5, 16(a0)
; RV64ZVE32F-NEXT: ld t0, 8(a0)
-; RV64ZVE32F-NEXT: andi t1, a5, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a6, v0
+; RV64ZVE32F-NEXT: andi t1, a6, 1
; RV64ZVE32F-NEXT: bnez t1, .LBB38_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a5, 2
+; RV64ZVE32F-NEXT: andi a0, a6, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB38_6
; RV64ZVE32F-NEXT: .LBB38_2: # %else2
-; RV64ZVE32F-NEXT: andi a0, a5, 4
+; RV64ZVE32F-NEXT: andi a0, a6, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB38_7
; RV64ZVE32F-NEXT: .LBB38_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a5, 8
+; RV64ZVE32F-NEXT: andi a0, a6, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB38_8
; RV64ZVE32F-NEXT: .LBB38_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB38_5: # %cond.store
; RV64ZVE32F-NEXT: ld a1, 0(a1)
; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: sd a0, 0(a1)
-; RV64ZVE32F-NEXT: andi a0, a5, 2
+; RV64ZVE32F-NEXT: andi a0, a6, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB38_2
; RV64ZVE32F-NEXT: .LBB38_6: # %cond.store1
; RV64ZVE32F-NEXT: sd t0, 0(a7)
-; RV64ZVE32F-NEXT: andi a0, a5, 4
+; RV64ZVE32F-NEXT: andi a0, a6, 4
; RV64ZVE32F-NEXT: beqz a0, .LBB38_3
; RV64ZVE32F-NEXT: .LBB38_7: # %cond.store3
-; RV64ZVE32F-NEXT: sd a6, 0(a4)
-; RV64ZVE32F-NEXT: andi a0, a5, 8
+; RV64ZVE32F-NEXT: sd a5, 0(a4)
+; RV64ZVE32F-NEXT: andi a0, a6, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB38_4
; RV64ZVE32F-NEXT: .LBB38_8: # %cond.store5
; RV64ZVE32F-NEXT: sd a3, 0(a2)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> %val, <4 x i64*> %ptrs, i32 8, <4 x i1> %m)
ret void
@@ -3434,64 +3177,51 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x i64*> %ptrs) {
;
; RV32ZVE32F-LABEL: mscatter_truemask_v4i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV32ZVE32F-NEXT: lw a1, 28(a0)
; RV32ZVE32F-NEXT: lw a2, 24(a0)
; RV32ZVE32F-NEXT: lw a3, 20(a0)
; RV32ZVE32F-NEXT: lw a4, 16(a0)
-; RV32ZVE32F-NEXT: lw a6, 12(a0)
-; RV32ZVE32F-NEXT: lw a5, 8(a0)
-; RV32ZVE32F-NEXT: lw t0, 4(a0)
-; RV32ZVE32F-NEXT: lw a7, 0(a0)
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmset.m v0
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
+; RV32ZVE32F-NEXT: lw a7, 12(a0)
+; RV32ZVE32F-NEXT: lw a6, 8(a0)
; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a0, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a0)
-; RV32ZVE32F-NEXT: lb a0, 15(sp)
+; RV32ZVE32F-NEXT: vmset.m v9
+; RV32ZVE32F-NEXT: vmv.x.s a5, v9
; RV32ZVE32F-NEXT: beqz zero, .LBB39_5
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a7, a0, 2
-; RV32ZVE32F-NEXT: bnez a7, .LBB39_6
+; RV32ZVE32F-NEXT: andi a0, a5, 2
+; RV32ZVE32F-NEXT: bnez a0, .LBB39_6
; RV32ZVE32F-NEXT: .LBB39_2: # %else2
-; RV32ZVE32F-NEXT: andi a5, a0, 4
-; RV32ZVE32F-NEXT: bnez a5, .LBB39_7
+; RV32ZVE32F-NEXT: andi a0, a5, 4
+; RV32ZVE32F-NEXT: bnez a0, .LBB39_7
; RV32ZVE32F-NEXT: .LBB39_3: # %else4
-; RV32ZVE32F-NEXT: andi a0, a0, 8
+; RV32ZVE32F-NEXT: andi a0, a5, 8
; RV32ZVE32F-NEXT: bnez a0, .LBB39_8
; RV32ZVE32F-NEXT: .LBB39_4: # %else6
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB39_5: # %cond.store
+; RV32ZVE32F-NEXT: lw t0, 4(a0)
+; RV32ZVE32F-NEXT: lw a0, 0(a0)
; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s t1, v8
; RV32ZVE32F-NEXT: sw t0, 4(t1)
-; RV32ZVE32F-NEXT: sw a7, 0(t1)
-; RV32ZVE32F-NEXT: andi a7, a0, 2
-; RV32ZVE32F-NEXT: beqz a7, .LBB39_2
+; RV32ZVE32F-NEXT: sw a0, 0(t1)
+; RV32ZVE32F-NEXT: andi a0, a5, 2
+; RV32ZVE32F-NEXT: beqz a0, .LBB39_2
; RV32ZVE32F-NEXT: .LBB39_6: # %cond.store1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a7, v9
-; RV32ZVE32F-NEXT: sw a6, 4(a7)
-; RV32ZVE32F-NEXT: sw a5, 0(a7)
-; RV32ZVE32F-NEXT: andi a5, a0, 4
-; RV32ZVE32F-NEXT: beqz a5, .LBB39_3
+; RV32ZVE32F-NEXT: vmv.x.s a0, v9
+; RV32ZVE32F-NEXT: sw a7, 4(a0)
+; RV32ZVE32F-NEXT: sw a6, 0(a0)
+; RV32ZVE32F-NEXT: andi a0, a5, 4
+; RV32ZVE32F-NEXT: beqz a0, .LBB39_3
; RV32ZVE32F-NEXT: .LBB39_7: # %cond.store3
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a5, v9
-; RV32ZVE32F-NEXT: sw a4, 0(a5)
-; RV32ZVE32F-NEXT: sw a3, 4(a5)
-; RV32ZVE32F-NEXT: andi a0, a0, 8
+; RV32ZVE32F-NEXT: vmv.x.s a0, v9
+; RV32ZVE32F-NEXT: sw a4, 0(a0)
+; RV32ZVE32F-NEXT: sw a3, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a5, 8
; RV32ZVE32F-NEXT: beqz a0, .LBB39_4
; RV32ZVE32F-NEXT: .LBB39_8: # %cond.store5
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
@@ -3499,62 +3229,47 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x i64*> %ptrs) {
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a2, 0(a0)
; RV32ZVE32F-NEXT: sw a1, 4(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_truemask_v4i64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV64ZVE32F-NEXT: ld a2, 24(a1)
-; RV64ZVE32F-NEXT: ld a3, 16(a1)
-; RV64ZVE32F-NEXT: ld a5, 8(a1)
-; RV64ZVE32F-NEXT: ld a7, 0(a1)
-; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: ld a4, 16(a0)
-; RV64ZVE32F-NEXT: ld a6, 8(a0)
-; RV64ZVE32F-NEXT: ld t0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
+; RV64ZVE32F-NEXT: ld a4, 16(a1)
+; RV64ZVE32F-NEXT: ld a7, 8(a1)
+; RV64ZVE32F-NEXT: ld a3, 24(a0)
+; RV64ZVE32F-NEXT: ld a5, 16(a0)
+; RV64ZVE32F-NEXT: ld t0, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v8
+; RV64ZVE32F-NEXT: vmv.x.s a6, v8
; RV64ZVE32F-NEXT: beqz zero, .LBB39_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a7, a0, 2
-; RV64ZVE32F-NEXT: bnez a7, .LBB39_6
+; RV64ZVE32F-NEXT: andi a0, a6, 2
+; RV64ZVE32F-NEXT: bnez a0, .LBB39_6
; RV64ZVE32F-NEXT: .LBB39_2: # %else2
-; RV64ZVE32F-NEXT: andi a5, a0, 4
-; RV64ZVE32F-NEXT: bnez a5, .LBB39_7
+; RV64ZVE32F-NEXT: andi a0, a6, 4
+; RV64ZVE32F-NEXT: bnez a0, .LBB39_7
; RV64ZVE32F-NEXT: .LBB39_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a6, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB39_8
; RV64ZVE32F-NEXT: .LBB39_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB39_5: # %cond.store
-; RV64ZVE32F-NEXT: sd t0, 0(a7)
-; RV64ZVE32F-NEXT: andi a7, a0, 2
-; RV64ZVE32F-NEXT: beqz a7, .LBB39_2
+; RV64ZVE32F-NEXT: ld a1, 0(a1)
+; RV64ZVE32F-NEXT: ld a0, 0(a0)
+; RV64ZVE32F-NEXT: sd a0, 0(a1)
+; RV64ZVE32F-NEXT: andi a0, a6, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB39_2
; RV64ZVE32F-NEXT: .LBB39_6: # %cond.store1
-; RV64ZVE32F-NEXT: sd a6, 0(a5)
-; RV64ZVE32F-NEXT: andi a5, a0, 4
-; RV64ZVE32F-NEXT: beqz a5, .LBB39_3
+; RV64ZVE32F-NEXT: sd t0, 0(a7)
+; RV64ZVE32F-NEXT: andi a0, a6, 4
+; RV64ZVE32F-NEXT: beqz a0, .LBB39_3
; RV64ZVE32F-NEXT: .LBB39_7: # %cond.store3
-; RV64ZVE32F-NEXT: sd a4, 0(a3)
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: sd a5, 0(a4)
+; RV64ZVE32F-NEXT: andi a0, a6, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB39_4
; RV64ZVE32F-NEXT: .LBB39_8: # %cond.store5
-; RV64ZVE32F-NEXT: sd a1, 0(a2)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
+; RV64ZVE32F-NEXT: sd a3, 0(a2)
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -6435,27 +6150,14 @@ define void @mscatter_v2f16(<2 x half> %val, <2 x half*> %ptrs, <2 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v2f16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB53_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB53_4
; RV64ZVE32F-NEXT: .LBB53_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB53_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
@@ -6466,7 +6168,6 @@ define void @mscatter_v2f16(<2 x half> %val, <2 x half*> %ptrs, <2 x i1> %m) {
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse16.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> %val, <2 x half*> %ptrs, i32 2, <2 x i1> %m)
ret void
@@ -6489,60 +6190,46 @@ define void @mscatter_v4f16(<4 x half> %val, <4 x half*> %ptrs, <4 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v4f16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
-; RV64ZVE32F-NEXT: ld a3, 16(a0)
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: ld a4, 8(a0)
-; RV64ZVE32F-NEXT: andi a5, a2, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a3, v0
+; RV64ZVE32F-NEXT: andi a5, a3, 1
; RV64ZVE32F-NEXT: bnez a5, .LBB54_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB54_6
; RV64ZVE32F-NEXT: .LBB54_2: # %else2
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB54_7
; RV64ZVE32F-NEXT: .LBB54_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB54_8
; RV64ZVE32F-NEXT: .LBB54_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB54_5: # %cond.store
; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vse16.v v8, (a0)
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB54_2
; RV64ZVE32F-NEXT: .LBB54_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vse16.v v9, (a4)
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: beqz a0, .LBB54_3
; RV64ZVE32F-NEXT: .LBB54_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB54_4
; RV64ZVE32F-NEXT: .LBB54_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse16.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> %m)
ret void
@@ -6563,60 +6250,46 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x half*> %ptrs) {
;
; RV64ZVE32F-LABEL: mscatter_truemask_v4f16:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV64ZVE32F-NEXT: ld a1, 24(a0)
; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
+; RV64ZVE32F-NEXT: ld a4, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB55_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB55_6
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: bnez a0, .LBB55_6
; RV64ZVE32F-NEXT: .LBB55_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB55_7
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: bnez a0, .LBB55_7
; RV64ZVE32F-NEXT: .LBB55_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB55_8
; RV64ZVE32F-NEXT: .LBB55_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB55_5: # %cond.store
+; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
-; RV64ZVE32F-NEXT: vse16.v v8, (a4)
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB55_2
+; RV64ZVE32F-NEXT: vse16.v v8, (a0)
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB55_2
; RV64ZVE32F-NEXT: .LBB55_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV64ZVE32F-NEXT: vse16.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB55_3
+; RV64ZVE32F-NEXT: vse16.v v9, (a4)
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: beqz a0, .LBB55_3
; RV64ZVE32F-NEXT: .LBB55_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB55_4
; RV64ZVE32F-NEXT: .LBB55_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse16.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -7330,27 +7003,14 @@ define void @mscatter_v2f32(<2 x float> %val, <2 x float*> %ptrs, <2 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v2f32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB63_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB63_4
; RV64ZVE32F-NEXT: .LBB63_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB63_3: # %cond.store
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
@@ -7361,7 +7021,6 @@ define void @mscatter_v2f32(<2 x float> %val, <2 x float*> %ptrs, <2 x i1> %m) {
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vse32.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %val, <2 x float*> %ptrs, i32 4, <2 x i1> %m)
ret void
@@ -7384,60 +7043,46 @@ define void @mscatter_v4f32(<4 x float> %val, <4 x float*> %ptrs, <4 x i1> %m) {
;
; RV64ZVE32F-LABEL: mscatter_v4f32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a1)
; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
-; RV64ZVE32F-NEXT: ld a3, 16(a0)
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: ld a4, 8(a0)
-; RV64ZVE32F-NEXT: andi a5, a2, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a3, v0
+; RV64ZVE32F-NEXT: andi a5, a3, 1
; RV64ZVE32F-NEXT: bnez a5, .LBB64_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB64_6
; RV64ZVE32F-NEXT: .LBB64_2: # %else2
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB64_7
; RV64ZVE32F-NEXT: .LBB64_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB64_8
; RV64ZVE32F-NEXT: .LBB64_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB64_5: # %cond.store
; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vse32.v v8, (a0)
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB64_2
; RV64ZVE32F-NEXT: .LBB64_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
; RV64ZVE32F-NEXT: vse32.v v9, (a4)
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: beqz a0, .LBB64_3
; RV64ZVE32F-NEXT: .LBB64_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: vse32.v v9, (a2)
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB64_4
; RV64ZVE32F-NEXT: .LBB64_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse32.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> %m)
ret void
@@ -7458,60 +7103,46 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x float*> %ptrs) {
;
; RV64ZVE32F-LABEL: mscatter_truemask_v4f32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV64ZVE32F-NEXT: ld a1, 24(a0)
; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v10, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0
+; RV64ZVE32F-NEXT: ld a4, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v9, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: beqz zero, .LBB65_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB65_6
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: bnez a0, .LBB65_6
; RV64ZVE32F-NEXT: .LBB65_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB65_7
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: bnez a0, .LBB65_7
; RV64ZVE32F-NEXT: .LBB65_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB65_8
; RV64ZVE32F-NEXT: .LBB65_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB65_5: # %cond.store
+; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; RV64ZVE32F-NEXT: vse32.v v8, (a4)
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB65_2
+; RV64ZVE32F-NEXT: vse32.v v8, (a0)
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB65_2
; RV64ZVE32F-NEXT: .LBB65_6: # %cond.store1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV64ZVE32F-NEXT: vse32.v v9, (a3)
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB65_3
+; RV64ZVE32F-NEXT: vse32.v v9, (a4)
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: beqz a0, .LBB65_3
; RV64ZVE32F-NEXT: .LBB65_7: # %cond.store3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; RV64ZVE32F-NEXT: vse32.v v9, (a2)
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB65_4
; RV64ZVE32F-NEXT: .LBB65_8: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV64ZVE32F-NEXT: vse32.v v8, (a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
@@ -8629,30 +8260,17 @@ define void @mscatter_v2f64(<2 x double> %val, <2 x double*> %ptrs, <2 x i1> %m)
;
; RV32ZVE32F-LABEL: mscatter_v2f64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a0, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a0)
-; RV32ZVE32F-NEXT: lbu a0, 15(sp)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a0, v0
; RV32ZVE32F-NEXT: andi a1, a0, 1
; RV32ZVE32F-NEXT: bnez a1, .LBB76_3
; RV32ZVE32F-NEXT: # %bb.1: # %else
; RV32ZVE32F-NEXT: andi a0, a0, 2
; RV32ZVE32F-NEXT: bnez a0, .LBB76_4
; RV32ZVE32F-NEXT: .LBB76_2: # %else2
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB76_3: # %cond.store
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
; RV32ZVE32F-NEXT: fsd fa0, 0(a1)
; RV32ZVE32F-NEXT: andi a0, a0, 2
@@ -8662,32 +8280,18 @@ define void @mscatter_v2f64(<2 x double> %val, <2 x double*> %ptrs, <2 x i1> %m)
; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: fsd fa1, 0(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_v2f64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a2, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a2)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a2, v0
; RV64ZVE32F-NEXT: andi a3, a2, 1
; RV64ZVE32F-NEXT: bnez a3, .LBB76_3
; RV64ZVE32F-NEXT: # %bb.1: # %else
; RV64ZVE32F-NEXT: andi a0, a2, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB76_4
; RV64ZVE32F-NEXT: .LBB76_2: # %else2
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB76_3: # %cond.store
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
@@ -8695,7 +8299,6 @@ define void @mscatter_v2f64(<2 x double> %val, <2 x double*> %ptrs, <2 x i1> %m)
; RV64ZVE32F-NEXT: beqz a0, .LBB76_2
; RV64ZVE32F-NEXT: .LBB76_4: # %cond.store1
; RV64ZVE32F-NEXT: fsd fa1, 0(a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 8, <2 x i1> %m)
ret void
@@ -8718,20 +8321,8 @@ define void @mscatter_v4f64(<4 x double> %val, <4 x double*> %ptrs, <4 x i1> %m)
;
; RV32ZVE32F-LABEL: mscatter_v4f64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a0, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a0)
-; RV32ZVE32F-NEXT: lbu a0, 15(sp)
+; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV32ZVE32F-NEXT: vmv.x.s a0, v0
; RV32ZVE32F-NEXT: andi a1, a0, 1
; RV32ZVE32F-NEXT: bnez a1, .LBB77_5
; RV32ZVE32F-NEXT: # %bb.1: # %else
@@ -8744,10 +8335,9 @@ define void @mscatter_v4f64(<4 x double> %val, <4 x double*> %ptrs, <4 x i1> %m)
; RV32ZVE32F-NEXT: andi a0, a0, 8
; RV32ZVE32F-NEXT: bnez a0, .LBB77_8
; RV32ZVE32F-NEXT: .LBB77_4: # %else6
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB77_5: # %cond.store
-; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
; RV32ZVE32F-NEXT: fsd fa0, 0(a1)
; RV32ZVE32F-NEXT: andi a1, a0, 2
@@ -8771,58 +8361,43 @@ define void @mscatter_v4f64(<4 x double> %val, <4 x double*> %ptrs, <4 x i1> %m)
; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: fsd fa3, 0(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_v4f64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a1, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a1)
; RV64ZVE32F-NEXT: ld a1, 24(a0)
-; RV64ZVE32F-NEXT: lbu a2, 15(sp)
-; RV64ZVE32F-NEXT: ld a3, 16(a0)
+; RV64ZVE32F-NEXT: ld a2, 16(a0)
; RV64ZVE32F-NEXT: ld a4, 8(a0)
-; RV64ZVE32F-NEXT: andi a5, a2, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vmv.x.s a3, v0
+; RV64ZVE32F-NEXT: andi a5, a3, 1
; RV64ZVE32F-NEXT: bnez a5, .LBB77_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB77_6
; RV64ZVE32F-NEXT: .LBB77_2: # %else2
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB77_7
; RV64ZVE32F-NEXT: .LBB77_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB77_8
; RV64ZVE32F-NEXT: .LBB77_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB77_5: # %cond.store
; RV64ZVE32F-NEXT: ld a0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
-; RV64ZVE32F-NEXT: andi a0, a2, 2
+; RV64ZVE32F-NEXT: andi a0, a3, 2
; RV64ZVE32F-NEXT: beqz a0, .LBB77_2
; RV64ZVE32F-NEXT: .LBB77_6: # %cond.store1
; RV64ZVE32F-NEXT: fsd fa1, 0(a4)
-; RV64ZVE32F-NEXT: andi a0, a2, 4
+; RV64ZVE32F-NEXT: andi a0, a3, 4
; RV64ZVE32F-NEXT: beqz a0, .LBB77_3
; RV64ZVE32F-NEXT: .LBB77_7: # %cond.store3
-; RV64ZVE32F-NEXT: fsd fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a0, a2, 8
+; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB77_4
; RV64ZVE32F-NEXT: .LBB77_8: # %cond.store5
; RV64ZVE32F-NEXT: fsd fa3, 0(a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %val, <4 x double*> %ptrs, i32 8, <4 x i1> %m)
ret void
@@ -8843,21 +8418,9 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x double*> %ptrs) {
;
; RV32ZVE32F-LABEL: mscatter_truemask_v4f64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32ZVE32F-NEXT: vmset.m v0
-; RV32ZVE32F-NEXT: vmv.v.i v9, 0
-; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0
; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmv.v.i v10, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0
-; RV32ZVE32F-NEXT: addi a0, sp, 15
-; RV32ZVE32F-NEXT: vsm.v v9, (a0)
-; RV32ZVE32F-NEXT: lb a0, 15(sp)
+; RV32ZVE32F-NEXT: vmset.m v9
+; RV32ZVE32F-NEXT: vmv.x.s a0, v9
; RV32ZVE32F-NEXT: beqz zero, .LBB78_5
; RV32ZVE32F-NEXT: # %bb.1: # %else
; RV32ZVE32F-NEXT: andi a1, a0, 2
@@ -8869,7 +8432,6 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x double*> %ptrs) {
; RV32ZVE32F-NEXT: andi a0, a0, 8
; RV32ZVE32F-NEXT: bnez a0, .LBB78_8
; RV32ZVE32F-NEXT: .LBB78_4: # %else6
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB78_5: # %cond.store
; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu
@@ -8896,58 +8458,43 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x double*> %ptrs) {
; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: fsd fa3, 0(a0)
-; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_truemask_v4f64:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi sp, sp, -16
-; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
; RV64ZVE32F-NEXT: ld a1, 24(a0)
; RV64ZVE32F-NEXT: ld a2, 16(a0)
-; RV64ZVE32F-NEXT: ld a3, 8(a0)
-; RV64ZVE32F-NEXT: ld a4, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64ZVE32F-NEXT: vmset.m v0
-; RV64ZVE32F-NEXT: vmv.v.i v8, 0
-; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmv.v.i v9, 0
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0
+; RV64ZVE32F-NEXT: ld a4, 8(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0
-; RV64ZVE32F-NEXT: addi a0, sp, 15
-; RV64ZVE32F-NEXT: vsm.v v8, (a0)
-; RV64ZVE32F-NEXT: lb a0, 15(sp)
+; RV64ZVE32F-NEXT: vmset.m v8
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: beqz zero, .LBB78_5
; RV64ZVE32F-NEXT: # %bb.1: # %else
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB78_6
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: bnez a0, .LBB78_6
; RV64ZVE32F-NEXT: .LBB78_2: # %else2
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: bnez a3, .LBB78_7
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: bnez a0, .LBB78_7
; RV64ZVE32F-NEXT: .LBB78_3: # %else4
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: bnez a0, .LBB78_8
; RV64ZVE32F-NEXT: .LBB78_4: # %else6
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB78_5: # %cond.store
-; RV64ZVE32F-NEXT: fsd fa0, 0(a4)
-; RV64ZVE32F-NEXT: andi a4, a0, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB78_2
+; RV64ZVE32F-NEXT: ld a0, 0(a0)
+; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
+; RV64ZVE32F-NEXT: andi a0, a3, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB78_2
; RV64ZVE32F-NEXT: .LBB78_6: # %cond.store1
-; RV64ZVE32F-NEXT: fsd fa1, 0(a3)
-; RV64ZVE32F-NEXT: andi a3, a0, 4
-; RV64ZVE32F-NEXT: beqz a3, .LBB78_3
+; RV64ZVE32F-NEXT: fsd fa1, 0(a4)
+; RV64ZVE32F-NEXT: andi a0, a3, 4
+; RV64ZVE32F-NEXT: beqz a0, .LBB78_3
; RV64ZVE32F-NEXT: .LBB78_7: # %cond.store3
; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
-; RV64ZVE32F-NEXT: andi a0, a0, 8
+; RV64ZVE32F-NEXT: andi a0, a3, 8
; RV64ZVE32F-NEXT: beqz a0, .LBB78_4
; RV64ZVE32F-NEXT: .LBB78_8: # %cond.store5
; RV64ZVE32F-NEXT: fsd fa3, 0(a1)
-; RV64ZVE32F-NEXT: addi sp, sp, 16
; RV64ZVE32F-NEXT: ret
%mhead = insertelement <4 x i1> poison, i1 1, i32 0
%mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index a0dc946e223f3..a169bb68611c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -49,24 +49,18 @@ declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <
define <2 x i16> @mgather_v2i16_align1(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
; RV32-LABEL: mgather_v2i16_align1:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmv.v.i v11, 0
-; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32-NEXT: vslideup.vi v11, v10, 0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmsne.vi v10, v11, 0
-; RV32-NEXT: addi a0, sp, 15
-; RV32-NEXT: vsm.v v10, (a0)
-; RV32-NEXT: lbu a0, 15(sp)
+; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV32-NEXT: vmv.x.s a0, v0
; RV32-NEXT: andi a1, a0, 1
-; RV32-NEXT: beqz a1, .LBB4_2
-; RV32-NEXT: # %bb.1: # %cond.load
-; RV32-NEXT: vsetivli zero, 0, e32, mf2, ta, mu
+; RV32-NEXT: bnez a1, .LBB4_3
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a0, a0, 2
+; RV32-NEXT: bnez a0, .LBB4_4
+; RV32-NEXT: .LBB4_2: # %else2
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB4_3: # %cond.load
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: lb a2, 1(a1)
; RV32-NEXT: lbu a1, 0(a1)
@@ -74,10 +68,9 @@ define <2 x i16> @mgather_v2i16_align1(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16>
; RV32-NEXT: or a1, a2, a1
; RV32-NEXT: vsetivli zero, 2, e16, mf4, tu, mu
; RV32-NEXT: vmv.s.x v9, a1
-; RV32-NEXT: .LBB4_2: # %else
; RV32-NEXT: andi a0, a0, 2
-; RV32-NEXT: beqz a0, .LBB4_4
-; RV32-NEXT: # %bb.3: # %cond.load1
+; RV32-NEXT: beqz a0, .LBB4_2
+; RV32-NEXT: .LBB4_4: # %cond.load1
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
; RV32-NEXT: vslidedown.vi v8, v8, 1
; RV32-NEXT: vmv.x.s a0, v8
@@ -88,31 +81,23 @@ define <2 x i16> @mgather_v2i16_align1(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16>
; RV32-NEXT: vmv.s.x v8, a0
; RV32-NEXT: vsetivli zero, 2, e16, mf4, tu, mu
; RV32-NEXT: vslideup.vi v9, v8, 1
-; RV32-NEXT: .LBB4_4: # %else2
; RV32-NEXT: vmv1r.v v8, v9
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: mgather_v2i16_align1:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vmerge.vim v10, v10, 1, v0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmv.v.i v11, 0
-; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64-NEXT: vslideup.vi v11, v10, 0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmsne.vi v10, v11, 0
-; RV64-NEXT: addi a0, sp, 15
-; RV64-NEXT: vsm.v v10, (a0)
-; RV64-NEXT: lbu a0, 15(sp)
+; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV64-NEXT: vmv.x.s a0, v0
; RV64-NEXT: andi a1, a0, 1
-; RV64-NEXT: beqz a1, .LBB4_2
-; RV64-NEXT: # %bb.1: # %cond.load
-; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: bnez a1, .LBB4_3
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a0, a0, 2
+; RV64-NEXT: bnez a0, .LBB4_4
+; RV64-NEXT: .LBB4_2: # %else2
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB4_3: # %cond.load
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: lb a2, 1(a1)
; RV64-NEXT: lbu a1, 0(a1)
@@ -120,10 +105,9 @@ define <2 x i16> @mgather_v2i16_align1(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16>
; RV64-NEXT: or a1, a2, a1
; RV64-NEXT: vsetivli zero, 2, e16, mf4, tu, mu
; RV64-NEXT: vmv.s.x v9, a1
-; RV64-NEXT: .LBB4_2: # %else
; RV64-NEXT: andi a0, a0, 2
-; RV64-NEXT: beqz a0, .LBB4_4
-; RV64-NEXT: # %bb.3: # %cond.load1
+; RV64-NEXT: beqz a0, .LBB4_2
+; RV64-NEXT: .LBB4_4: # %cond.load1
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
; RV64-NEXT: vslidedown.vi v8, v8, 1
; RV64-NEXT: vmv.x.s a0, v8
@@ -134,9 +118,7 @@ define <2 x i16> @mgather_v2i16_align1(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16>
; RV64-NEXT: vmv.s.x v8, a0
; RV64-NEXT: vsetivli zero, 2, e16, mf4, tu, mu
; RV64-NEXT: vslideup.vi v9, v8, 1
-; RV64-NEXT: .LBB4_4: # %else2
; RV64-NEXT: vmv1r.v v8, v9
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 1, <2 x i1> %m, <2 x i16> %passthru)
ret <2 x i16> %v
@@ -147,25 +129,19 @@ declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <
define <2 x i64> @mgather_v2i64_align4(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passthru) {
; RV32-LABEL: mgather_v2i64_align4:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmv.v.i v11, 0
-; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32-NEXT: vslideup.vi v11, v10, 0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmsne.vi v10, v11, 0
-; RV32-NEXT: addi a0, sp, 15
-; RV32-NEXT: vsm.v v10, (a0)
-; RV32-NEXT: lbu a0, 15(sp)
+; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV32-NEXT: vmv.x.s a0, v0
; RV32-NEXT: andi a1, a0, 1
; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: beqz a1, .LBB5_2
-; RV32-NEXT: # %bb.1: # %cond.load
+; RV32-NEXT: bnez a1, .LBB5_3
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a0, a0, 2
+; RV32-NEXT: bnez a0, .LBB5_4
+; RV32-NEXT: .LBB5_2: # %else2
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB5_3: # %cond.load
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: lw a2, 4(a1)
; RV32-NEXT: lw a1, 0(a1)
@@ -173,10 +149,9 @@ define <2 x i64> @mgather_v2i64_align4(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64>
; RV32-NEXT: vslide1up.vx v12, v11, a1
; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu
; RV32-NEXT: vslideup.vi v9, v12, 0
-; RV32-NEXT: .LBB5_2: # %else
; RV32-NEXT: andi a0, a0, 2
-; RV32-NEXT: beqz a0, .LBB5_4
-; RV32-NEXT: # %bb.3: # %cond.load1
+; RV32-NEXT: beqz a0, .LBB5_2
+; RV32-NEXT: .LBB5_4: # %cond.load1
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
; RV32-NEXT: vslidedown.vi v8, v8, 1
; RV32-NEXT: vmv.x.s a0, v8
@@ -187,31 +162,23 @@ define <2 x i64> @mgather_v2i64_align4(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64>
; RV32-NEXT: vslide1up.vx v10, v8, a0
; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu
; RV32-NEXT: vslideup.vi v9, v10, 1
-; RV32-NEXT: .LBB5_4: # %else2
; RV32-NEXT: vmv1r.v v8, v9
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: mgather_v2i64_align4:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vmerge.vim v10, v10, 1, v0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmv.v.i v11, 0
-; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64-NEXT: vslideup.vi v11, v10, 0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmsne.vi v10, v11, 0
-; RV64-NEXT: addi a0, sp, 15
-; RV64-NEXT: vsm.v v10, (a0)
-; RV64-NEXT: lbu a0, 15(sp)
+; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV64-NEXT: vmv.x.s a0, v0
; RV64-NEXT: andi a1, a0, 1
-; RV64-NEXT: beqz a1, .LBB5_2
-; RV64-NEXT: # %bb.1: # %cond.load
-; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: bnez a1, .LBB5_3
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a0, a0, 2
+; RV64-NEXT: bnez a0, .LBB5_4
+; RV64-NEXT: .LBB5_2: # %else2
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB5_3: # %cond.load
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: lwu a2, 4(a1)
; RV64-NEXT: lwu a1, 0(a1)
@@ -219,10 +186,9 @@ define <2 x i64> @mgather_v2i64_align4(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64>
; RV64-NEXT: or a1, a2, a1
; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu
; RV64-NEXT: vmv.s.x v9, a1
-; RV64-NEXT: .LBB5_2: # %else
; RV64-NEXT: andi a0, a0, 2
-; RV64-NEXT: beqz a0, .LBB5_4
-; RV64-NEXT: # %bb.3: # %cond.load1
+; RV64-NEXT: beqz a0, .LBB5_2
+; RV64-NEXT: .LBB5_4: # %cond.load1
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
; RV64-NEXT: vslidedown.vi v8, v8, 1
; RV64-NEXT: vmv.x.s a0, v8
@@ -233,9 +199,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64>
; RV64-NEXT: vmv.s.x v8, a0
; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu
; RV64-NEXT: vslideup.vi v9, v8, 1
-; RV64-NEXT: .LBB5_4: # %else2
; RV64-NEXT: vmv1r.v v8, v9
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %ptrs, i32 4, <2 x i1> %m, <2 x i64> %passthru)
ret <2 x i64> %v
@@ -246,20 +210,8 @@ declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x
define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m) {
; RV32-LABEL: mscatter_v4i16_align1:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmv.v.i v11, 0
-; RV32-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV32-NEXT: vslideup.vi v11, v10, 0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmsne.vi v10, v11, 0
-; RV32-NEXT: addi a0, sp, 15
-; RV32-NEXT: vsm.v v10, (a0)
-; RV32-NEXT: lbu a0, 15(sp)
+; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV32-NEXT: vmv.x.s a0, v0
; RV32-NEXT: andi a1, a0, 1
; RV32-NEXT: bnez a1, .LBB6_5
; RV32-NEXT: # %bb.1: # %else
@@ -272,7 +224,6 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m
; RV32-NEXT: andi a0, a0, 8
; RV32-NEXT: bnez a0, .LBB6_8
; RV32-NEXT: .LBB6_4: # %else6
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32-NEXT: .LBB6_5: # %cond.store
; RV32-NEXT: vsetivli zero, 0, e16, mf2, ta, mu
@@ -318,25 +269,12 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m
; RV32-NEXT: sb a0, 0(a1)
; RV32-NEXT: srli a0, a0, 8
; RV32-NEXT: sb a0, 1(a1)
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_v4i16_align1:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
-; RV64-NEXT: vmv.v.i v9, 0
-; RV64-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmv.v.i v12, 0
-; RV64-NEXT: vsetivli zero, 4, e8, mf2, tu, mu
-; RV64-NEXT: vslideup.vi v12, v9, 0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmsne.vi v9, v12, 0
-; RV64-NEXT: addi a0, sp, 15
-; RV64-NEXT: vsm.v v9, (a0)
-; RV64-NEXT: lbu a0, 15(sp)
+; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV64-NEXT: vmv.x.s a0, v0
; RV64-NEXT: andi a1, a0, 1
; RV64-NEXT: bnez a1, .LBB6_5
; RV64-NEXT: # %bb.1: # %else
@@ -349,7 +287,6 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m
; RV64-NEXT: andi a0, a0, 8
; RV64-NEXT: bnez a0, .LBB6_8
; RV64-NEXT: .LBB6_4: # %else6
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
; RV64-NEXT: .LBB6_5: # %cond.store
; RV64-NEXT: vsetivli zero, 0, e16, mf2, ta, mu
@@ -395,7 +332,6 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m
; RV64-NEXT: sb a0, 0(a1)
; RV64-NEXT: srli a0, a0, 8
; RV64-NEXT: sb a0, 1(a1)
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 1, <4 x i1> %m)
ret void
@@ -406,30 +342,17 @@ declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x
define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m) {
; RV32-LABEL: mscatter_v2i32_align2:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmv.v.i v11, 0
-; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32-NEXT: vslideup.vi v11, v10, 0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmsne.vi v10, v11, 0
-; RV32-NEXT: addi a0, sp, 15
-; RV32-NEXT: vsm.v v10, (a0)
-; RV32-NEXT: lbu a0, 15(sp)
+; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV32-NEXT: vmv.x.s a0, v0
; RV32-NEXT: andi a1, a0, 1
; RV32-NEXT: bnez a1, .LBB7_3
; RV32-NEXT: # %bb.1: # %else
; RV32-NEXT: andi a0, a0, 2
; RV32-NEXT: bnez a0, .LBB7_4
; RV32-NEXT: .LBB7_2: # %else2
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32-NEXT: .LBB7_3: # %cond.store
-; RV32-NEXT: vsetivli zero, 0, e32, mf2, ta, mu
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: vmv.x.s a2, v9
; RV32-NEXT: sh a1, 0(a2)
@@ -446,35 +369,21 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m
; RV32-NEXT: sh a0, 0(a1)
; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: sh a0, 2(a1)
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_v2i32_align2:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vmerge.vim v10, v10, 1, v0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmv.v.i v11, 0
-; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64-NEXT: vslideup.vi v11, v10, 0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmsne.vi v10, v11, 0
-; RV64-NEXT: addi a0, sp, 15
-; RV64-NEXT: vsm.v v10, (a0)
-; RV64-NEXT: lbu a0, 15(sp)
+; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu
+; RV64-NEXT: vmv.x.s a0, v0
; RV64-NEXT: andi a1, a0, 1
; RV64-NEXT: bnez a1, .LBB7_3
; RV64-NEXT: # %bb.1: # %else
; RV64-NEXT: andi a0, a0, 2
; RV64-NEXT: bnez a0, .LBB7_4
; RV64-NEXT: .LBB7_2: # %else2
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
; RV64-NEXT: .LBB7_3: # %cond.store
-; RV64-NEXT: vsetivli zero, 0, e32, mf2, ta, mu
+; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV64-NEXT: vmv.x.s a2, v9
@@ -493,7 +402,6 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m
; RV64-NEXT: sh a0, 0(a1)
; RV64-NEXT: srli a0, a0, 16
; RV64-NEXT: sh a0, 2(a1)
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 2, <2 x i1> %m)
ret void
@@ -504,21 +412,10 @@ declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %res_ptr) nounwind {
; RV32-LABEL: masked_load_v2i32_align1:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
-; RV32-NEXT: vmseq.vi v0, v8, 0
-; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmseq.vi v8, v8, 0
; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; RV32-NEXT: vmerge.vim v8, v8, 1, v0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV32-NEXT: vslideup.vi v9, v8, 0
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmsne.vi v8, v9, 0
-; RV32-NEXT: addi a2, sp, 15
-; RV32-NEXT: vsm.v v8, (a2)
-; RV32-NEXT: lbu a2, 15(sp)
+; RV32-NEXT: vmv.x.s a2, v8
; RV32-NEXT: andi a3, a2, 1
; RV32-NEXT: beqz a3, .LBB8_2
; RV32-NEXT: # %bb.1: # %cond.load
@@ -559,26 +456,14 @@ define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %r
; RV32-NEXT: .LBB8_4: # %else2
; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; RV32-NEXT: vse32.v v8, (a1)
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: masked_load_v2i32_align1:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
-; RV64-NEXT: vmseq.vi v0, v8, 0
-; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmseq.vi v8, v8, 0
; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; RV64-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmv.v.i v9, 0
-; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; RV64-NEXT: vslideup.vi v9, v8, 0
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmsne.vi v8, v9, 0
-; RV64-NEXT: addi a2, sp, 15
-; RV64-NEXT: vsm.v v8, (a2)
-; RV64-NEXT: lbu a2, 15(sp)
+; RV64-NEXT: vmv.x.s a2, v8
; RV64-NEXT: andi a3, a2, 1
; RV64-NEXT: beqz a3, .LBB8_2
; RV64-NEXT: # %bb.1: # %cond.load
@@ -619,7 +504,6 @@ define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %r
; RV64-NEXT: .LBB8_4: # %else2
; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; RV64-NEXT: vse32.v v8, (a1)
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%mask = icmp eq <2 x i32> %m, zeroinitializer
%load = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %a, i32 1, <2 x i1> %mask, <2 x i32> undef)
@@ -632,31 +516,19 @@ declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i
define void @masked_store_v2i32_align2(<2 x i32> %val, <2 x i32>* %a, <2 x i32> %m) nounwind {
; CHECK-LABEL: masked_store_v2i32_align2:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
-; CHECK-NEXT: vmseq.vi v0, v9, 0
-; CHECK-NEXT: vmv.v.i v9, 0
+; CHECK-NEXT: vmseq.vi v9, v9, 0
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, mu
-; CHECK-NEXT: vslideup.vi v10, v9, 0
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vmsne.vi v9, v10, 0
-; CHECK-NEXT: addi a1, sp, 15
-; CHECK-NEXT: vsm.v v9, (a1)
-; CHECK-NEXT: lbu a1, 15(sp)
+; CHECK-NEXT: vmv.x.s a1, v9
; CHECK-NEXT: andi a2, a1, 1
; CHECK-NEXT: bnez a2, .LBB9_3
; CHECK-NEXT: # %bb.1: # %else
; CHECK-NEXT: andi a1, a1, 2
; CHECK-NEXT: bnez a1, .LBB9_4
; CHECK-NEXT: .LBB9_2: # %else2
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB9_3: # %cond.store
-; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; CHECK-NEXT: vmv.x.s a2, v8
; CHECK-NEXT: sh a2, 0(a0)
; CHECK-NEXT: srli a2, a2, 16
@@ -670,7 +542,6 @@ define void @masked_store_v2i32_align2(<2 x i32> %val, <2 x i32>* %a, <2 x i32>
; CHECK-NEXT: sh a1, 4(a0)
; CHECK-NEXT: srli a1, a1, 16
; CHECK-NEXT: sh a1, 6(a0)
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%mask = icmp eq <2 x i32> %m, zeroinitializer
call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %a, i32 2, <2 x i1> %mask)