[llvm] [RISCV] Correct the limit of RegPressureSet `GPRAll` (PR #118473)
Pengcheng Wang via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 3 05:47:49 PST 2024
https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/118473
>From 17117c191b5fd5e9c047af74d39dc3a7be9d2091 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Tue, 3 Dec 2024 19:00:18 +0800
Subject: [PATCH 1/3] [RISCV] Correct the limit of RegPressureSet `GPRAll`
The generated limit is 33, which is the total number of scalar registers
plus 1 (for `DUMMY_REG_PAIR_WITH_X0`).
This is incorrect because not all scalar registers can actually be used:
4-6 of them are reserved, so the limit needs to be adjusted by the
reserved register set.
This change affects instruction scheduling, MachineLICM, and other passes
that consult register pressure.
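For illustration, here is a minimal standalone sketch of the arithmetic (hypothetical
names, not the LLVM API; the actual change is the `getRegPressureSetLimit` override in
the diff below): the `GPRAll` limit becomes the number of scalar registers minus
however many of them are reserved for the function.
```cpp
// Standalone sketch: model the adjusted pressure limit as
// "number of GPRs minus reserved GPRs", mirroring the patch below.
#include <bitset>
#include <cstdio>

constexpr unsigned NumGPRs = 32;

unsigned adjustedGPRAllLimit(const std::bitset<NumGPRs> &Reserved) {
  return NumGPRs - static_cast<unsigned>(Reserved.count());
}

int main() {
  // Hypothetical reserved set for illustration: x0 (zero), x2 (sp),
  // x3 (gp), x4 (tp). A frame pointer (x8) would add one more.
  std::bitset<NumGPRs> Reserved;
  Reserved.set(0);
  Reserved.set(2);
  Reserved.set(3);
  Reserved.set(4);
  // The generated limit was 33 (32 GPRs + DUMMY_REG_PAIR_WITH_X0);
  // with 4 reserved registers the adjusted limit is 28.
  std::printf("adjusted GPRAll limit: %u\n", adjustedGPRAllLimit(Reserved));
  return 0;
}
```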
Here are the statistics of spills/reloads on `llvm-test-suite` with
`-O3 -march=rva23u64`:
```
Metric: regalloc.NumSpills,regalloc.NumReloads
Program regalloc.NumSpills regalloc.NumReloads
baseline after diff baseline after diff
External/S...NT2017rate/502.gcc_r/502.gcc_r 11812.00 11338.00 -474.00 26813.00 25751.00 -1062.00
External/S...T2017speed/602.gcc_s/602.gcc_s 11812.00 11338.00 -474.00 26813.00 25751.00 -1062.00
External/S...te/526.blender_r/526.blender_r 13514.00 13228.00 -286.00 27456.00 27260.00 -196.00
External/S...00.perlbench_s/600.perlbench_s 4398.00 4274.00 -124.00 9745.00 9341.00 -404.00
External/S...00.perlbench_r/500.perlbench_r 4398.00 4274.00 -124.00 9745.00 9341.00 -404.00
SingleSour...nchmarks/Adobe-C++/loop_unroll 1533.00 1413.00 -120.00 2943.00 2633.00 -310.00
External/S...rate/510.parest_r/510.parest_r 43985.00 43879.00 -106.00 87409.00 87309.00 -100.00
External/S...te/538.imagick_r/538.imagick_r 4160.00 4060.00 -100.00 10338.00 10244.00 -94.00
External/S...ed/638.imagick_s/638.imagick_s 4160.00 4060.00 -100.00 10338.00 10244.00 -94.00
MultiSourc...e/Applications/ClamAV/clamscan 2120.00 2023.00 -97.00 5035.00 4901.00 -134.00
MultiSourc...sumer-typeset/consumer-typeset 1218.00 1129.00 -89.00 3041.00 2887.00 -154.00
MultiSourc.../Applications/JM/ldecod/ldecod 1341.00 1263.00 -78.00 2316.00 2238.00 -78.00
External/S...rate/511.povray_r/511.povray_r 1734.00 1659.00 -75.00 3413.00 3246.00 -167.00
MultiSource/Applications/SPASS/SPASS 1442.00 1376.00 -66.00 2954.00 2837.00 -117.00
MultiSourc.../DOE-ProxyApps-C++/CLAMR/CLAMR 1628.00 1568.00 -60.00 3026.00 2958.00 -68.00
regalloc.NumSpills regalloc.NumReloads
run baseline after diff baseline after diff
mean 86.725206 85.041122 -1.684083 1363.122137 1342.900383 -3.212869
```
Co-authored-by: BoyaoWang430 <wangboyao at bytedance.com>
---
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 14 +
llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 2 +
llvm/test/CodeGen/RISCV/pr69586.ll | 821 ++---
.../RISCV/rvv/fixed-vectors-masked-scatter.ll | 78 +-
.../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 2208 +++++------
.../RISCV/rvv/intrinsic-vector-match.ll | 700 ++--
.../RISCV/rvv/vxrm-insert-out-of-loop.ll | 5 +-
...lar-shift-by-byte-multiple-legalization.ll | 3240 ++++++++---------
.../RISCV/wide-scalar-shift-legalization.ll | 646 ++--
9 files changed, 3755 insertions(+), 3959 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index cfcc3119257f65..a73bd1621a739d 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -934,3 +934,17 @@ bool RISCVRegisterInfo::getRegAllocationHints(
return BaseImplRetVal;
}
+
+unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const {
+ if (Idx == RISCV::RegisterPressureSets::GPRAll) {
+ unsigned Reserved = 0;
+ BitVector ReservedRegs = getReservedRegs(MF);
+ for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
+ if (ReservedRegs.test(Reg))
+ Reserved++;
+
+ return 32 - Reserved;
+ }
+ return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694e175c8..ca4934de2f52d2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,6 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
static bool isRVVRegClass(const TargetRegisterClass *RC) {
return RISCVRI::isVRegClass(RC->TSFlags);
}
+ unsigned getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const override;
};
} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 9fc9a3c42867e7..21e64ada7061aa 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -44,59 +44,50 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: addi a5, a7, 512
; NOREMAT-NEXT: addi a4, a7, 1024
; NOREMAT-NEXT: addi a6, a7, 1536
-; NOREMAT-NEXT: li t4, 1
-; NOREMAT-NEXT: li a2, 5
-; NOREMAT-NEXT: li t1, 3
-; NOREMAT-NEXT: li t0, 7
-; NOREMAT-NEXT: lui t5, 1
-; NOREMAT-NEXT: li s4, 9
-; NOREMAT-NEXT: li s6, 11
-; NOREMAT-NEXT: li s9, 13
-; NOREMAT-NEXT: li ra, 15
-; NOREMAT-NEXT: lui t2, 2
-; NOREMAT-NEXT: lui s1, 3
-; NOREMAT-NEXT: lui t3, 4
-; NOREMAT-NEXT: lui s0, 5
-; NOREMAT-NEXT: lui s3, 6
-; NOREMAT-NEXT: lui s7, 7
+; NOREMAT-NEXT: li t1, 1
+; NOREMAT-NEXT: li a3, 5
+; NOREMAT-NEXT: li t0, 3
+; NOREMAT-NEXT: li a2, 7
+; NOREMAT-NEXT: lui t2, 1
+; NOREMAT-NEXT: li s5, 9
+; NOREMAT-NEXT: li s8, 11
+; NOREMAT-NEXT: lui s1, 2
+; NOREMAT-NEXT: lui t5, 3
+; NOREMAT-NEXT: lui s11, 4
+; NOREMAT-NEXT: lui ra, 5
+; NOREMAT-NEXT: lui t3, 6
+; NOREMAT-NEXT: lui s0, 7
; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT: slli t4, t4, 11
-; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli a3, a2, 9
-; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli t6, t1, 10
-; NOREMAT-NEXT: slli s2, t0, 9
-; NOREMAT-NEXT: add a0, a7, t5
-; NOREMAT-NEXT: lui s11, 1
-; NOREMAT-NEXT: slli s4, s4, 9
-; NOREMAT-NEXT: slli s5, a2, 10
-; NOREMAT-NEXT: slli s6, s6, 9
-; NOREMAT-NEXT: slli s8, t1, 11
+; NOREMAT-NEXT: slli t4, t1, 11
+; NOREMAT-NEXT: slli t6, a3, 9
+; NOREMAT-NEXT: slli s2, t0, 10
+; NOREMAT-NEXT: slli s4, a2, 9
+; NOREMAT-NEXT: add a0, a7, t2
; NOREMAT-NEXT: vle32.v v8, (a5)
-; NOREMAT-NEXT: slli s9, s9, 9
-; NOREMAT-NEXT: li t5, 13
+; NOREMAT-NEXT: slli s5, s5, 9
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: slli s10, t0, 10
+; NOREMAT-NEXT: slli s6, a3, 10
; NOREMAT-NEXT: vle32.v v0, (a6)
; NOREMAT-NEXT: vle32.v v12, (a6)
-; NOREMAT-NEXT: slli ra, ra, 9
+; NOREMAT-NEXT: slli s8, s8, 9
+; NOREMAT-NEXT: slli s9, t0, 11
; NOREMAT-NEXT: vle32.v v4, (a0)
; NOREMAT-NEXT: vle32.v v20, (a0)
-; NOREMAT-NEXT: add a4, a7, t2
+; NOREMAT-NEXT: add a4, a7, s1
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: add a4, a7, s1
+; NOREMAT-NEXT: add a4, a7, t5
; NOREMAT-NEXT: vle32.v v28, (a4)
; NOREMAT-NEXT: vle32.v v26, (a4)
-; NOREMAT-NEXT: add a4, a7, t3
+; NOREMAT-NEXT: add a4, a7, s11
; NOREMAT-NEXT: vle32.v v24, (a4)
; NOREMAT-NEXT: vle32.v v22, (a4)
-; NOREMAT-NEXT: add a4, a7, s0
+; NOREMAT-NEXT: add a4, a7, ra
; NOREMAT-NEXT: vle32.v v14, (a7)
; NOREMAT-NEXT: vle32.v v18, (a4)
; NOREMAT-NEXT: vle32.v v16, (a4)
-; NOREMAT-NEXT: add a4, a7, s3
+; NOREMAT-NEXT: add a4, a7, t3
; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8
; NOREMAT-NEXT: vle32.v v14, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10
@@ -107,78 +98,86 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, a3
+; NOREMAT-NEXT: add a4, a7, t6
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, t6
+; NOREMAT-NEXT: add a4, a7, s2
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, s2
+; NOREMAT-NEXT: add a4, a7, s4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s7
+; NOREMAT-NEXT: add a4, a7, s0
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, s4
+; NOREMAT-NEXT: add a4, a7, s5
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s5
+; NOREMAT-NEXT: add a4, a7, s6
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, s6
+; NOREMAT-NEXT: add a4, a7, s8
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s8
+; NOREMAT-NEXT: add a4, a7, s9
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, s9
+; NOREMAT-NEXT: li t5, 13
+; NOREMAT-NEXT: slli a4, t5, 9
+; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s10
+; NOREMAT-NEXT: slli a4, a2, 10
+; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, ra
+; NOREMAT-NEXT: li a6, 15
+; NOREMAT-NEXT: slli a4, a6, 9
+; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v2, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT: lui t4, 8
-; NOREMAT-NEXT: add a5, a7, t4
+; NOREMAT-NEXT: lui t1, 8
+; NOREMAT-NEXT: add a5, a7, t1
; NOREMAT-NEXT: vle32.v v20, (a5)
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2
; NOREMAT-NEXT: li a4, 17
; NOREMAT-NEXT: slli a4, a4, 9
-; NOREMAT-NEXT: li s1, 17
-; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: li t2, 17
+; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6
; NOREMAT-NEXT: li a5, 9
; NOREMAT-NEXT: slli a4, a5, 10
-; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a4, 592(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: li a4, 19
; NOREMAT-NEXT: slli a4, a4, 9
-; NOREMAT-NEXT: li t2, 19
-; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: li s1, 19
+; NOREMAT-NEXT: sd a4, 584(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: slli a3, a2, 11
-; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli a3, a3, 11
+; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
@@ -186,46 +185,45 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li s7, 21
; NOREMAT-NEXT: slli a3, s7, 9
-; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: li a6, 11
-; NOREMAT-NEXT: slli a3, a6, 10
-; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: li a4, 11
+; NOREMAT-NEXT: slli a3, a4, 10
+; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
; NOREMAT-NEXT: li s3, 23
-; NOREMAT-NEXT: slli a3, s3, 9
-; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: slli s10, s3, 9
+; NOREMAT-NEXT: add a3, a7, s10
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
; NOREMAT-NEXT: li s0, 25
; NOREMAT-NEXT: slli a3, s0, 9
-; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: slli a3, t5, 10
-; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28
; NOREMAT-NEXT: li t3, 27
; NOREMAT-NEXT: slli a3, t3, 9
-; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a3, 536(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v28, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
-; NOREMAT-NEXT: slli a2, t0, 11
-; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli a2, a2, 11
+; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
@@ -233,39 +231,37 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li t0, 29
; NOREMAT-NEXT: slli a2, t0, 9
-; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT: li a3, 15
-; NOREMAT-NEXT: slli a2, a3, 10
-; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli a2, a6, 10
+; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT: li t1, 31
-; NOREMAT-NEXT: slli a2, t1, 9
-; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
-; NOREMAT-NEXT: vle32.v v12, (a2)
-; NOREMAT-NEXT: vle32.v v4, (a2)
-; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT: lui a4, 4
-; NOREMAT-NEXT: addiw a0, a4, 512
-; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: li a3, 31
+; NOREMAT-NEXT: slli a0, a3, 9
+; NOREMAT-NEXT: sd a0, 504(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a0, a7, a0
-; NOREMAT-NEXT: vle32.v v8, (a0)
-; NOREMAT-NEXT: vle32.v v26, (a0)
+; NOREMAT-NEXT: vle32.v v12, (a0)
+; NOREMAT-NEXT: vle32.v v4, (a0)
+; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8
+; NOREMAT-NEXT: addiw a2, s11, 512
+; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: vle32.v v8, (a2)
+; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28
-; NOREMAT-NEXT: slli a2, s1, 10
+; NOREMAT-NEXT: slli a2, t2, 10
; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: addiw a2, a4, 1536
+; NOREMAT-NEXT: addiw a2, s11, 1536
; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
@@ -277,27 +273,25 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8
-; NOREMAT-NEXT: lui a5, 5
-; NOREMAT-NEXT: addiw a2, a5, -1536
+; NOREMAT-NEXT: addiw a2, ra, -1536
; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28
-; NOREMAT-NEXT: slli a2, t2, 10
+; NOREMAT-NEXT: slli a2, s1, 10
; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: li t2, 19
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT: addiw a2, a5, -512
+; NOREMAT-NEXT: addiw a2, ra, -512
; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24
-; NOREMAT-NEXT: addiw a2, a5, 512
+; NOREMAT-NEXT: addiw a2, ra, 512
; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
@@ -309,20 +303,20 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26
-; NOREMAT-NEXT: addiw a2, a5, 1536
+; NOREMAT-NEXT: addiw a2, ra, 1536
; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
-; NOREMAT-NEXT: slli a2, a6, 11
+; NOREMAT-NEXT: slli a2, a4, 11
; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18
-; NOREMAT-NEXT: lui a6, 6
-; NOREMAT-NEXT: addiw a2, a6, -1536
+; NOREMAT-NEXT: lui a4, 6
+; NOREMAT-NEXT: addiw a2, a4, -1536
; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
@@ -334,13 +328,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT: addiw a2, a6, -512
+; NOREMAT-NEXT: addiw a2, a4, -512
; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT: addiw a2, a6, 512
+; NOREMAT-NEXT: addiw a2, a4, 512
; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
@@ -352,7 +346,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18
-; NOREMAT-NEXT: addiw a2, a6, 1536
+; NOREMAT-NEXT: addiw a2, a4, 1536
; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
@@ -364,8 +358,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8
-; NOREMAT-NEXT: lui s0, 7
-; NOREMAT-NEXT: addiw a2, s0, -1536
+; NOREMAT-NEXT: lui a5, 7
+; NOREMAT-NEXT: addiw a2, a5, -1536
; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
@@ -379,15 +373,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: addi a0, sp, 640
; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22
-; NOREMAT-NEXT: addiw a2, s0, -512
+; NOREMAT-NEXT: addiw a2, a5, -512
; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26
-; NOREMAT-NEXT: addiw a2, s0, 512
+; NOREMAT-NEXT: addiw a2, a5, 512
; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: lui t3, 7
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
@@ -398,30 +391,30 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16
-; NOREMAT-NEXT: addiw a2, t3, 1536
+; NOREMAT-NEXT: addiw a2, a5, 1536
; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
-; NOREMAT-NEXT: slli a2, a3, 11
+; NOREMAT-NEXT: slli a2, a6, 11
; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14
-; NOREMAT-NEXT: addiw a2, t4, -1536
+; NOREMAT-NEXT: addiw a2, t1, -1536
; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
-; NOREMAT-NEXT: slli a2, t1, 10
+; NOREMAT-NEXT: slli a2, a3, 10
; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22
; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addiw a0, t4, -512
+; NOREMAT-NEXT: addiw a0, t1, -512
; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a0, a7, a0
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0
@@ -438,32 +431,33 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; NOREMAT-NEXT: addi a0, a1, 1024
; NOREMAT-NEXT: vse32.v v8, (a0)
-; NOREMAT-NEXT: add s11, a1, s11
-; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: lui a0, 1
+; NOREMAT-NEXT: add a0, a1, a0
+; NOREMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 2
; NOREMAT-NEXT: add a0, a1, a0
; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 3
; NOREMAT-NEXT: add a0, a1, a0
; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add s11, a1, s11
+; NOREMAT-NEXT: sd s11, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add ra, a1, ra
+; NOREMAT-NEXT: sd ra, 240(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a4, a1, a4
-; NOREMAT-NEXT: sd a4, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a4, 232(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a5, a1, a5
-; NOREMAT-NEXT: sd a5, 240(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a6, a1, a6
-; NOREMAT-NEXT: sd a6, 232(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add t3, a1, t3
-; NOREMAT-NEXT: sd t3, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a1, t4
+; NOREMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a1, t1
; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t4, 512
+; NOREMAT-NEXT: addiw a0, t1, 512
; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t4, 1024
+; NOREMAT-NEXT: addiw a0, t1, 1024
; NOREMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t4, 1536
+; NOREMAT-NEXT: addiw a0, t1, 1536
; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli s1, s1, 11
-; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli t2, t2, 11
+; NOREMAT-NEXT: sd t2, 128(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 9
; NOREMAT-NEXT: addiw a2, a0, -1536
; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
@@ -476,7 +470,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: addiw s11, a0, 512
; NOREMAT-NEXT: addiw s7, a0, 1024
; NOREMAT-NEXT: addiw s3, a0, 1536
-; NOREMAT-NEXT: slli s1, t2, 11
+; NOREMAT-NEXT: slli s1, s1, 11
; NOREMAT-NEXT: lui a0, 10
; NOREMAT-NEXT: addiw t2, a0, -1536
; NOREMAT-NEXT: addiw a7, a0, -1024
@@ -484,52 +478,52 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: add a2, a1, a0
; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: addiw a0, a0, 512
-; NOREMAT-NEXT: ld a2, 512(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT: add a2, a1, a2
-; NOREMAT-NEXT: ld a3, 504(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT: add a3, a1, a3
-; NOREMAT-NEXT: add a5, a1, t6
-; NOREMAT-NEXT: add a6, a1, s2
-; NOREMAT-NEXT: add t0, a1, s4
-; NOREMAT-NEXT: add t1, a1, s5
-; NOREMAT-NEXT: add t3, a1, s6
-; NOREMAT-NEXT: add t4, a1, s8
-; NOREMAT-NEXT: add t5, a1, s9
-; NOREMAT-NEXT: add t6, a1, s10
-; NOREMAT-NEXT: add s0, a1, ra
-; NOREMAT-NEXT: ld s2, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add a2, a1, t4
+; NOREMAT-NEXT: add a3, a1, t6
+; NOREMAT-NEXT: add a5, a1, s2
+; NOREMAT-NEXT: add a6, a1, s4
+; NOREMAT-NEXT: add t0, a1, s5
+; NOREMAT-NEXT: add t1, a1, s6
+; NOREMAT-NEXT: add t3, a1, s8
+; NOREMAT-NEXT: add t4, a1, s9
+; NOREMAT-NEXT: ld t5, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add t5, a1, t5
+; NOREMAT-NEXT: ld t6, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add t6, a1, t6
+; NOREMAT-NEXT: ld s0, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add s0, a1, s0
+; NOREMAT-NEXT: ld s2, 600(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s2, a1, s2
-; NOREMAT-NEXT: ld s4, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s4, 592(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s4, a1, s4
-; NOREMAT-NEXT: ld s5, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s5, 584(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s5, a1, s5
-; NOREMAT-NEXT: ld s6, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s6, 576(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s6, a1, s6
-; NOREMAT-NEXT: ld s8, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s8, 568(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s8, a1, s8
-; NOREMAT-NEXT: ld s9, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s9, 560(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s9, a1, s9
-; NOREMAT-NEXT: ld s10, 576(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s10, a1, s10
-; NOREMAT-NEXT: ld ra, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 504(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload
@@ -923,10 +917,9 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: .cfi_offset s10, -96
; REMAT-NEXT: .cfi_offset s11, -104
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 18
-; REMAT-NEXT: mul a2, a2, a3
+; REMAT-NEXT: slli a2, a2, 3
; REMAT-NEXT: sub sp, sp, a2
-; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb
+; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb
; REMAT-NEXT: li a4, 32
; REMAT-NEXT: addi a5, a0, 512
; REMAT-NEXT: addi a3, a0, 1024
@@ -963,23 +956,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli s6, s6, 9
; REMAT-NEXT: li s7, 5
; REMAT-NEXT: slli s7, s7, 11
-; REMAT-NEXT: li s8, 21
-; REMAT-NEXT: slli s8, s8, 9
-; REMAT-NEXT: li s9, 11
-; REMAT-NEXT: slli s9, s9, 10
-; REMAT-NEXT: li s10, 23
-; REMAT-NEXT: slli s10, s10, 9
-; REMAT-NEXT: lui s11, 3
-; REMAT-NEXT: li ra, 25
-; REMAT-NEXT: slli ra, ra, 9
; REMAT-NEXT: vsetvli zero, a4, e32, m2, ta, ma
; REMAT-NEXT: vle32.v v8, (a5)
-; REMAT-NEXT: li a4, 13
-; REMAT-NEXT: slli a4, a4, 10
+; REMAT-NEXT: li a4, 21
+; REMAT-NEXT: slli a4, a4, 9
; REMAT-NEXT: vle32.v v10, (a3)
; REMAT-NEXT: vle32.v v12, (a3)
-; REMAT-NEXT: li a3, 27
-; REMAT-NEXT: slli a3, a3, 9
+; REMAT-NEXT: li a3, 11
+; REMAT-NEXT: slli a3, a3, 10
; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: add a2, a0, a6
@@ -995,7 +979,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 4
+; REMAT-NEXT: li a5, 6
+; REMAT-NEXT: mul a2, a2, a5
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1004,8 +989,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a5, 14
-; REMAT-NEXT: mul a2, a2, a5
+; REMAT-NEXT: slli a2, a2, 2
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1019,17 +1003,11 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: add a2, a0, t5
-; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a5, 12
-; REMAT-NEXT: mul a2, a2, a5
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: add a2, a0, t6
+; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18
; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: add a2, a0, t6
+; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: add a2, a0, s0
@@ -1039,403 +1017,340 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: add a2, a0, s1
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30
-; REMAT-NEXT: vle32.v v28, (a2)
+; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: add a2, a0, s2
-; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: vle32.v v12, (a2)
; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: slli a5, a5, 4
+; REMAT-NEXT: li a6, 6
+; REMAT-NEXT: mul a5, a5, a6
; REMAT-NEXT: add a5, sp, a5
; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT: vl2r.v v28, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v28, v2
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: add a2, a0, s3
-; REMAT-NEXT: vle32.v v12, (a2)
+; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: li a6, 14
-; REMAT-NEXT: mul a5, a5, a6
+; REMAT-NEXT: slli a5, a5, 2
; REMAT-NEXT: add a5, sp, a5
; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
-; REMAT-NEXT: vle32.v v30, (a2)
+; REMAT-NEXT: vl2r.v v30, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v30, v4
+; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: add a2, a0, s4
-; REMAT-NEXT: vle32.v v16, (a2)
+; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT: vle32.v v6, (a2)
-; REMAT-NEXT: add a2, a0, s5
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14
-; REMAT-NEXT: vle32.v v4, (a2)
-; REMAT-NEXT: add a2, a0, s6
-; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: li a6, 12
-; REMAT-NEXT: mul a5, a5, a6
-; REMAT-NEXT: add a5, sp, a5
-; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT: add a2, a0, s5
+; REMAT-NEXT: vle32.v v6, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8
; REMAT-NEXT: vle32.v v0, (a2)
-; REMAT-NEXT: add a2, a0, s7
+; REMAT-NEXT: add a2, a0, s6
+; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16
; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: add a2, a0, s7
+; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: add a2, a0, s8
+; REMAT-NEXT: add a2, a0, a4
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26
-; REMAT-NEXT: vle32.v v26, (a2)
-; REMAT-NEXT: add a2, a0, s9
; REMAT-NEXT: vle32.v v24, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8
-; REMAT-NEXT: vle32.v v28, (a2)
-; REMAT-NEXT: add a2, a0, s10
-; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12
-; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v12, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: add a2, a0, s11
+; REMAT-NEXT: addi a2, sp, 432
+; REMAT-NEXT: vs2r.v v24, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: add a2, a0, a3
+; REMAT-NEXT: vle32.v v24, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v14, v12
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16
-; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: add a2, a0, ra
-; REMAT-NEXT: vle32.v v2, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 1
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: add a2, a0, a4
-; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14
+; REMAT-NEXT: li a5, 23
+; REMAT-NEXT: slli a5, a5, 9
+; REMAT-NEXT: add a2, a0, a5
+; REMAT-NEXT: vle32.v v26, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v2, v28
; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 2
+; REMAT-NEXT: li a3, 6
+; REMAT-NEXT: mul a2, a2, a3
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: add a2, a0, a3
+; REMAT-NEXT: lui s8, 3
+; REMAT-NEXT: add a2, a0, s8
+; REMAT-NEXT: vle32.v v28, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v4, v30
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18
-; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 4
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a5, 7
-; REMAT-NEXT: slli a5, a5, 11
-; REMAT-NEXT: add a2, a0, a5
-; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: addi a3, sp, 432
-; REMAT-NEXT: vs2r.v v18, (a3) # Unknown-size Folded Spill
-; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20
-; REMAT-NEXT: vle32.v v18, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 14
-; REMAT-NEXT: mul a2, a2, a3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a2, 29
-; REMAT-NEXT: slli a2, a2, 9
-; REMAT-NEXT: add a2, a0, a2
-; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v26, v24
-; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 12
-; REMAT-NEXT: mul a2, a2, a3
+; REMAT-NEXT: slli a2, a2, 2
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a2, 15
-; REMAT-NEXT: slli a2, a2, 10
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: li s9, 25
+; REMAT-NEXT: slli s9, s9, 9
+; REMAT-NEXT: add a2, a0, s9
; REMAT-NEXT: vle32.v v30, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8
-; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 10
-; REMAT-NEXT: mul a2, a2, a3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a2, 31
-; REMAT-NEXT: slli a2, a2, 9
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: sf.vc.vv 3, 0, v10, v6
+; REMAT-NEXT: vle32.v v14, (a2)
+; REMAT-NEXT: li s10, 13
+; REMAT-NEXT: slli s10, s10, 10
+; REMAT-NEXT: add a2, a0, s10
; REMAT-NEXT: vle32.v v6, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 3
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v12
+; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8
; REMAT-NEXT: vle32.v v8, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 3
+; REMAT-NEXT: slli a2, a2, 1
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: li s11, 27
+; REMAT-NEXT: slli s11, s11, 9
+; REMAT-NEXT: add a2, a0, s11
; REMAT-NEXT: vle32.v v4, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v16, v2
-; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 6
-; REMAT-NEXT: mul a2, a2, a3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: addiw a2, a2, 512
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16
+; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: li ra, 7
+; REMAT-NEXT: slli ra, ra, 11
+; REMAT-NEXT: add a2, a0, ra
; REMAT-NEXT: vle32.v v2, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 1
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: li a2, 17
-; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: li a2, 29
+; REMAT-NEXT: slli a2, a2, 9
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 2
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
+; REMAT-NEXT: addi a3, sp, 432
; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: addiw a2, a2, 1536
+; REMAT-NEXT: li a2, 15
+; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 4
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: addi a3, sp, 432
-; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT: sf.vc.vv 3, 0, v12, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: li a2, 9
-; REMAT-NEXT: slli a2, a2, 11
+; REMAT-NEXT: li a2, 31
+; REMAT-NEXT: slli a2, a2, 9
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 14
+; REMAT-NEXT: li a4, 6
; REMAT-NEXT: mul a3, a3, a4
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v10, v18
+; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: lui a2, 4
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 12
-; REMAT-NEXT: mul a3, a3, a4
+; REMAT-NEXT: slli a3, a3, 2
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: li a2, 19
-; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: addiw a2, a2, 512
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v30, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 10
-; REMAT-NEXT: mul a3, a3, a4
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v14, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: li a2, 17
+; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 3
+; REMAT-NEXT: slli a3, a3, 1
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: addiw a2, a2, 1536
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v4, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 6
-; REMAT-NEXT: mul a3, a3, a4
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, 512
+; REMAT-NEXT: li a2, 9
+; REMAT-NEXT: slli a2, a2, 11
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: li s7, 21
-; REMAT-NEXT: slli s7, s7, 10
-; REMAT-NEXT: add a2, a0, s7
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: lui s4, 5
-; REMAT-NEXT: addiw s4, s4, 1536
-; REMAT-NEXT: add a2, a0, s4
+; REMAT-NEXT: li a2, 19
+; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: li a2, 11
-; REMAT-NEXT: slli a2, a2, 11
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, -512
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui s3, 6
-; REMAT-NEXT: addiw s3, s3, -1536
-; REMAT-NEXT: add a2, a0, s3
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: li s2, 23
-; REMAT-NEXT: slli s2, s2, 10
-; REMAT-NEXT: add a2, a0, s2
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, 512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: lui a2, 6
-; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: li a2, 21
+; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, 1536
; REMAT-NEXT: add a2, a0, a2
-; REMAT-NEXT: lui s1, 6
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui s0, 6
-; REMAT-NEXT: addiw s0, s0, 512
-; REMAT-NEXT: add a2, a0, s0
+; REMAT-NEXT: li a2, 11
+; REMAT-NEXT: slli a2, a2, 11
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: li a2, 25
-; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: addiw a2, a2, -1536
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: lui t6, 6
-; REMAT-NEXT: addiw t6, t6, 1536
-; REMAT-NEXT: add a2, a0, t6
+; REMAT-NEXT: li a2, 23
+; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: li t5, 13
-; REMAT-NEXT: slli t5, t5, 11
-; REMAT-NEXT: add a2, a0, t5
+; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: lui a2, 6
; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui s1, 6
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: li t4, 27
-; REMAT-NEXT: slli t4, t4, 10
-; REMAT-NEXT: add a2, a0, t4
+; REMAT-NEXT: lui s0, 6
+; REMAT-NEXT: addiw s0, s0, 512
+; REMAT-NEXT: add a2, a0, s0
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: li a2, 25
+; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: add a2, a0, a2
-; REMAT-NEXT: lui t3, 7
+; REMAT-NEXT: lui t6, 6
+; REMAT-NEXT: addiw t6, t6, 1536
+; REMAT-NEXT: add a2, a0, t6
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui t2, 7
-; REMAT-NEXT: addiw t2, t2, 512
-; REMAT-NEXT: add a2, a0, t2
+; REMAT-NEXT: li t5, 13
+; REMAT-NEXT: slli t5, t5, 11
+; REMAT-NEXT: add a2, a0, t5
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: li t1, 29
-; REMAT-NEXT: slli t1, t1, 10
-; REMAT-NEXT: add a2, a0, t1
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: lui t0, 7
-; REMAT-NEXT: addiw t0, t0, 1536
-; REMAT-NEXT: add a2, a0, t0
+; REMAT-NEXT: li t4, 27
+; REMAT-NEXT: slli t4, t4, 10
+; REMAT-NEXT: add a2, a0, t4
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: li a7, 15
-; REMAT-NEXT: slli a7, a7, 11
-; REMAT-NEXT: add a2, a0, a7
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a6, 8
-; REMAT-NEXT: addiw a6, a6, -1536
-; REMAT-NEXT: add a2, a0, a6
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui t3, 7
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: li a4, 31
-; REMAT-NEXT: slli a4, a4, 10
-; REMAT-NEXT: add a2, a0, a4
+; REMAT-NEXT: lui t2, 7
+; REMAT-NEXT: addiw t2, t2, 512
+; REMAT-NEXT: add a2, a0, t2
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: lui a3, 8
-; REMAT-NEXT: addiw a3, a3, -512
-; REMAT-NEXT: add a2, a0, a3
+; REMAT-NEXT: li t1, 29
+; REMAT-NEXT: slli t1, t1, 10
+; REMAT-NEXT: add a2, a0, t1
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 8
-; REMAT-NEXT: add a0, a0, a2
-; REMAT-NEXT: vle32.v v4, (a0)
+; REMAT-NEXT: lui t0, 7
+; REMAT-NEXT: addiw t0, t0, 1536
+; REMAT-NEXT: add a2, a0, t0
+; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: li a7, 15
+; REMAT-NEXT: slli a7, a7, 11
+; REMAT-NEXT: add a2, a0, a7
+; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT: vle32.v v20, (a2)
+; REMAT-NEXT: lui a6, 8
+; REMAT-NEXT: addiw a6, a6, -1536
+; REMAT-NEXT: add a2, a0, a6
+; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT: vle32.v v22, (a2)
+; REMAT-NEXT: li a4, 31
+; REMAT-NEXT: slli a4, a4, 10
+; REMAT-NEXT: add a2, a0, a4
+; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: lui a3, 8
+; REMAT-NEXT: addiw a3, a3, -512
+; REMAT-NEXT: add a2, a0, a3
+; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT: vle32.v v10, (a2)
+; REMAT-NEXT: lui a2, 8
+; REMAT-NEXT: add a0, a0, a2
+; REMAT-NEXT: vle32.v v28, (a0)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: addi a0, a1, 1024
; REMAT-NEXT: vse32.v v8, (a0)
@@ -1482,45 +1397,38 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 336(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 15
-; REMAT-NEXT: slli a0, a0, 9
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 328(sp) # 8-byte Folded Spill
-; REMAT-NEXT: lui a0, 2
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 17
-; REMAT-NEXT: slli a0, a0, 9
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s2, a1, s2
+; REMAT-NEXT: sd s2, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s3, a1, s3
+; REMAT-NEXT: sd s3, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s4, a1, s4
+; REMAT-NEXT: sd s4, 312(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s5, a1, s5
; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s6, a1, s6
; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 5
-; REMAT-NEXT: slli a0, a0, 11
+; REMAT-NEXT: add s7, a1, s7
+; REMAT-NEXT: sd s7, 288(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 21
+; REMAT-NEXT: slli a0, a0, 9
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 11
+; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 288(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add a5, a1, a5
+; REMAT-NEXT: sd a5, 264(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s8, a1, s8
-; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd s8, 256(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s9, a1, s9
-; REMAT-NEXT: sd s9, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd s9, 248(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s10, a1, s10
-; REMAT-NEXT: sd s10, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd s10, 240(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s11, a1, s11
-; REMAT-NEXT: sd s11, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd s11, 232(sp) # 8-byte Folded Spill
; REMAT-NEXT: add ra, a1, ra
-; REMAT-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 13
-; REMAT-NEXT: slli a0, a0, 10
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 27
-; REMAT-NEXT: slli a0, a0, 9
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 232(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add a5, a1, a5
-; REMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd ra, 224(sp) # 8-byte Folded Spill
; REMAT-NEXT: li a0, 29
; REMAT-NEXT: slli a0, a0, 9
; REMAT-NEXT: add a0, a1, a0
@@ -1571,18 +1479,26 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: addiw a0, a0, 512
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 120(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s7, a1, s7
-; REMAT-NEXT: sd s7, 112(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s4, a1, s4
-; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 21
+; REMAT-NEXT: slli a0, a0, 10
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 112(sp) # 8-byte Folded Spill
+; REMAT-NEXT: lui a0, 5
+; REMAT-NEXT: addiw a0, a0, 1536
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 104(sp) # 8-byte Folded Spill
; REMAT-NEXT: li a0, 11
; REMAT-NEXT: slli a0, a0, 11
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s3, a1, s3
-; REMAT-NEXT: sd s3, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s2, a1, s2
-; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT: lui a0, 6
+; REMAT-NEXT: addiw a0, a0, -1536
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 23
+; REMAT-NEXT: slli a0, a0, 10
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 80(sp) # 8-byte Folded Spill
; REMAT-NEXT: lui a0, 6
; REMAT-NEXT: addiw a0, a0, -512
; REMAT-NEXT: add a0, a1, a0
@@ -1879,8 +1795,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: csrr a0, vlenb
-; REMAT-NEXT: li a1, 18
-; REMAT-NEXT: mul a0, a0, a1
+; REMAT-NEXT: slli a0, a0, 3
; REMAT-NEXT: add sp, sp, a0
; REMAT-NEXT: .cfi_def_cfa sp, 544
; REMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 575a757149ebba..0b5856a7000dd4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -5682,28 +5682,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
;
; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -48
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48
-; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: addi sp, sp, -16
+; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
+; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s3, 0(sp) # 4-byte Folded Spill
; RV32ZVE32F-NEXT: .cfi_offset s0, -4
; RV32ZVE32F-NEXT: .cfi_offset s1, -8
; RV32ZVE32F-NEXT: .cfi_offset s2, -12
; RV32ZVE32F-NEXT: .cfi_offset s3, -16
-; RV32ZVE32F-NEXT: .cfi_offset s4, -20
-; RV32ZVE32F-NEXT: .cfi_offset s5, -24
-; RV32ZVE32F-NEXT: .cfi_offset s6, -28
-; RV32ZVE32F-NEXT: .cfi_offset s7, -32
-; RV32ZVE32F-NEXT: .cfi_offset s8, -36
-; RV32ZVE32F-NEXT: .cfi_offset s9, -40
; RV32ZVE32F-NEXT: .cfi_remember_state
; RV32ZVE32F-NEXT: lw a3, 56(a0)
; RV32ZVE32F-NEXT: lw a4, 60(a0)
@@ -5715,30 +5703,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
; RV32ZVE32F-NEXT: lw t4, 28(a0)
; RV32ZVE32F-NEXT: lw t1, 32(a0)
; RV32ZVE32F-NEXT: lw t2, 36(a0)
+; RV32ZVE32F-NEXT: lw t5, 0(a2)
+; RV32ZVE32F-NEXT: lw t6, 8(a2)
+; RV32ZVE32F-NEXT: lw s0, 16(a2)
+; RV32ZVE32F-NEXT: lw s1, 24(a2)
+; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.v.x v8, t5
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6
+; RV32ZVE32F-NEXT: lw t5, 32(a2)
+; RV32ZVE32F-NEXT: lw t6, 40(a2)
+; RV32ZVE32F-NEXT: lw s2, 48(a2)
+; RV32ZVE32F-NEXT: lw s3, 56(a2)
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s0
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s1
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t5
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6
; RV32ZVE32F-NEXT: lw s0, 8(a0)
; RV32ZVE32F-NEXT: lw s1, 12(a0)
; RV32ZVE32F-NEXT: lw t5, 16(a0)
; RV32ZVE32F-NEXT: lw t6, 20(a0)
-; RV32ZVE32F-NEXT: lw s2, 32(a2)
-; RV32ZVE32F-NEXT: lw s3, 40(a2)
-; RV32ZVE32F-NEXT: lw s4, 48(a2)
-; RV32ZVE32F-NEXT: lw s5, 56(a2)
-; RV32ZVE32F-NEXT: lw s6, 0(a2)
-; RV32ZVE32F-NEXT: lw s7, 8(a2)
-; RV32ZVE32F-NEXT: lw s8, 16(a2)
-; RV32ZVE32F-NEXT: lw s9, 24(a2)
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vmv.v.x v8, s6
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; RV32ZVE32F-NEXT: vmv.x.s a2, v0
; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s9
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5
; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3
; RV32ZVE32F-NEXT: andi s2, a2, 1
; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
@@ -5771,27 +5759,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
; RV32ZVE32F-NEXT: sw a3, 0(a0)
; RV32ZVE32F-NEXT: sw a4, 4(a0)
; RV32ZVE32F-NEXT: .LBB51_9: # %else14
-; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s3, 0(sp) # 4-byte Folded Reload
; RV32ZVE32F-NEXT: .cfi_restore s0
; RV32ZVE32F-NEXT: .cfi_restore s1
; RV32ZVE32F-NEXT: .cfi_restore s2
; RV32ZVE32F-NEXT: .cfi_restore s3
-; RV32ZVE32F-NEXT: .cfi_restore s4
-; RV32ZVE32F-NEXT: .cfi_restore s5
-; RV32ZVE32F-NEXT: .cfi_restore s6
-; RV32ZVE32F-NEXT: .cfi_restore s7
-; RV32ZVE32F-NEXT: .cfi_restore s8
-; RV32ZVE32F-NEXT: .cfi_restore s9
-; RV32ZVE32F-NEXT: addi sp, sp, 48
+; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 0
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 03d5762b4903ef..036fee6a13ca4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1364,19 +1364,16 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13
-; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 1
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: addi a2, a2, 848
+; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 13
+; ZVFHMIN32-NEXT: addi a2, sp, 848
; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11
-; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10
-; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9
-; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8
-; ZVFHMIN32-NEXT: vmv.x.s a4, v16
+; ZVFHMIN32-NEXT: vslidedown.vi v6, v8, 12
+; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 11
+; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 10
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 9
+; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 8
+; ZVFHMIN32-NEXT: vmv.x.s a3, v16
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1384,52 +1381,51 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 560(sp)
; ZVFHMIN32-NEXT: lh a1, 304(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7
-; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6
-; ZVFHMIN32-NEXT: vslidedown.vi v5, v16, 5
+; ZVFHMIN32-NEXT: vslidedown.vi v21, v16, 7
+; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 6
+; ZVFHMIN32-NEXT: vslidedown.vi v19, v16, 5
; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 21
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 10
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 20
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a2, a2, 4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 22
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a4, a2, 4
+; ZVFHMIN32-NEXT: sub a2, a4, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15
-; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 13
+; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 15
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 13
; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12
-; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 11
-; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10
+; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 11
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 18
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a4, a2, 4
+; ZVFHMIN32-NEXT: add a2, a4, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9
+; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 10
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 14
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 11
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8
+; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT: vslidedown.vi v4, v16, 9
+; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1437,12 +1433,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 558(sp)
; ZVFHMIN32-NEXT: lh a1, 302(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 7
-; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 6
-; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 5
-; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 4
-; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 3
-; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 2
+; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 7
+; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 6
+; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 5
+; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 4
+; ZVFHMIN32-NEXT: vslidedown.vi v31, v0, 3
+; ZVFHMIN32-NEXT: vslidedown.vi v5, v0, 2
; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 1
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 15
@@ -1453,63 +1449,63 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 3
+; ZVFHMIN32-NEXT: slli a2, a2, 1
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 6
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 6
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 12
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a2, a2, 3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 10
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 13
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 4
+; ZVFHMIN32-NEXT: li a4, 19
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a4, 21
+; ZVFHMIN32-NEXT: mul a2, a2, a4
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: addi a2, a2, 848
+; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8
-; ZVFHMIN32-NEXT: addi a2, sp, 848
-; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vmv.x.s t4, v26
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 215(sp)
; ZVFHMIN32-NEXT: lh a0, 556(sp)
; ZVFHMIN32-NEXT: lh a1, 300(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t3, v20
-; ZVFHMIN32-NEXT: vmv.x.s t1, v28
+; ZVFHMIN32-NEXT: vmv.x.s t3, v26
+; ZVFHMIN32-NEXT: vmv.x.s t2, v28
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 214(sp)
; ZVFHMIN32-NEXT: lh a0, 554(sp)
; ZVFHMIN32-NEXT: lh a1, 298(sp)
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 1
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t2, v0
-; ZVFHMIN32-NEXT: vmv.x.s t0, v4
+; ZVFHMIN32-NEXT: addi a2, sp, 848
+; ZVFHMIN32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t1, v16
+; ZVFHMIN32-NEXT: vmv.x.s t0, v6
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1517,229 +1513,234 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 552(sp)
; ZVFHMIN32-NEXT: lh a1, 296(sp)
; ZVFHMIN32-NEXT: vmv.x.s a7, v2
-; ZVFHMIN32-NEXT: vmv.x.s a6, v30
+; ZVFHMIN32-NEXT: vmv.x.s a6, v22
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 212(sp)
; ZVFHMIN32-NEXT: lh a0, 550(sp)
; ZVFHMIN32-NEXT: lh a1, 294(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a5, v22
+; ZVFHMIN32-NEXT: vmv.x.s a5, v20
; ZVFHMIN32-NEXT: vmv.x.s a2, v18
-; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 211(sp)
-; ZVFHMIN32-NEXT: lh a1, 548(sp)
-; ZVFHMIN32-NEXT: lh t5, 292(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a0, v14
-; ZVFHMIN32-NEXT: sw a0, 116(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: vmv.x.s a0, v8
-; ZVFHMIN32-NEXT: sw a0, 124(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 210(sp)
-; ZVFHMIN32-NEXT: lh a1, 546(sp)
-; ZVFHMIN32-NEXT: lh t5, 290(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT: vmv.x.s a4, v24
+; ZVFHMIN32-NEXT: lh a0, 548(sp)
+; ZVFHMIN32-NEXT: lh a1, 292(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a2, v14
+; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: sw a2, 124(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa3, t5
-; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3
-; ZVFHMIN32-NEXT: sb a1, 209(sp)
-; ZVFHMIN32-NEXT: lh a1, 544(sp)
-; ZVFHMIN32-NEXT: lh t5, 288(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a4, 192(sp)
-; ZVFHMIN32-NEXT: sb a1, 208(sp)
-; ZVFHMIN32-NEXT: lh t5, 738(sp)
-; ZVFHMIN32-NEXT: lh t6, 482(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a0, v12
-; ZVFHMIN32-NEXT: sw a0, 108(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: vmv.x.s a0, v10
-; ZVFHMIN32-NEXT: sw a0, 120(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT: sb t5, 177(sp)
-; ZVFHMIN32-NEXT: lh t5, 736(sp)
-; ZVFHMIN32-NEXT: lh t6, 480(sp)
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li a1, 29
-; ZVFHMIN32-NEXT: mul a0, a0, a1
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: lh s5, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li a1, 28
-; ZVFHMIN32-NEXT: mul a0, a0, a1
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: lh s6, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT: sb t5, 176(sp)
-; ZVFHMIN32-NEXT: lh t5, 734(sp)
-; ZVFHMIN32-NEXT: lh t6, 478(sp)
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li a1, 27
-; ZVFHMIN32-NEXT: mul a0, a0, a1
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: lh s7, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li a1, 26
-; ZVFHMIN32-NEXT: mul a0, a0, a1
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: lh s8, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT: sb t5, 175(sp)
-; ZVFHMIN32-NEXT: lh t5, 732(sp)
-; ZVFHMIN32-NEXT: lh t6, 476(sp)
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li a1, 25
-; ZVFHMIN32-NEXT: mul a0, a0, a1
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: lh s4, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li a1, 24
-; ZVFHMIN32-NEXT: mul a0, a0, a1
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: lh s3, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT: sb t5, 174(sp)
-; ZVFHMIN32-NEXT: lh t6, 730(sp)
-; ZVFHMIN32-NEXT: lh s9, 474(sp)
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li a1, 23
-; ZVFHMIN32-NEXT: mul a0, a0, a1
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: lh s2, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t5, v3
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 210(sp)
+; ZVFHMIN32-NEXT: lh a0, 546(sp)
+; ZVFHMIN32-NEXT: lh a1, 290(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: vmv.x.s a3, v24
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT: sb a0, 209(sp)
+; ZVFHMIN32-NEXT: lh a0, 544(sp)
+; ZVFHMIN32-NEXT: lh a1, 288(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a3, 192(sp)
+; ZVFHMIN32-NEXT: sb a0, 208(sp)
+; ZVFHMIN32-NEXT: lh a0, 738(sp)
+; ZVFHMIN32-NEXT: lh a1, 482(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a2, v10
+; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v12
+; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 177(sp)
+; ZVFHMIN32-NEXT: lh a0, 736(sp)
+; ZVFHMIN32-NEXT: lh a1, 480(sp)
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 29
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 28
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 176(sp)
+; ZVFHMIN32-NEXT: lh a0, 734(sp)
+; ZVFHMIN32-NEXT: lh a1, 478(sp)
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 27
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 26
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 175(sp)
+; ZVFHMIN32-NEXT: lh a0, 732(sp)
+; ZVFHMIN32-NEXT: lh a1, 476(sp)
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 25
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 24
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 174(sp)
+; ZVFHMIN32-NEXT: lh a0, 730(sp)
+; ZVFHMIN32-NEXT: lh a1, 474(sp)
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 23
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t4, v21
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 173(sp)
+; ZVFHMIN32-NEXT: lh a0, 728(sp)
+; ZVFHMIN32-NEXT: lh a1, 472(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t6, v3
+; ZVFHMIN32-NEXT: vmv.x.s t5, v19
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 172(sp)
+; ZVFHMIN32-NEXT: lh a0, 726(sp)
+; ZVFHMIN32-NEXT: lh a1, 470(sp)
+; ZVFHMIN32-NEXT: vmv.x.s s10, v11
+; ZVFHMIN32-NEXT: vmv.x.s s11, v7
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 171(sp)
+; ZVFHMIN32-NEXT: lh a0, 724(sp)
+; ZVFHMIN32-NEXT: lh s9, 468(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a4, v9
+; ZVFHMIN32-NEXT: vmv.x.s ra, v29
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, s9
-; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4
-; ZVFHMIN32-NEXT: sb t6, 173(sp)
-; ZVFHMIN32-NEXT: lh s9, 728(sp)
-; ZVFHMIN32-NEXT: lh s10, 472(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t6, v31
-; ZVFHMIN32-NEXT: vmv.x.s ra, v13
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4
-; ZVFHMIN32-NEXT: sb s9, 172(sp)
-; ZVFHMIN32-NEXT: lh s9, 726(sp)
-; ZVFHMIN32-NEXT: lh s10, 470(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v29
-; ZVFHMIN32-NEXT: vmv.x.s a3, v11
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4
-; ZVFHMIN32-NEXT: sb s9, 171(sp)
-; ZVFHMIN32-NEXT: lh s10, 724(sp)
-; ZVFHMIN32-NEXT: lh s11, 468(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a4, v7
-; ZVFHMIN32-NEXT: vmv.x.s s9, v9
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s10
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s11
-; ZVFHMIN32-NEXT: feq.h s10, fa5, fa4
-; ZVFHMIN32-NEXT: sb s10, 170(sp)
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 170(sp)
; ZVFHMIN32-NEXT: lh a0, 722(sp)
; ZVFHMIN32-NEXT: lh a1, 466(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s10, v21
-; ZVFHMIN32-NEXT: vmv.x.s s11, v27
+; ZVFHMIN32-NEXT: vmv.x.s s9, v31
+; ZVFHMIN32-NEXT: vmv.x.s a3, v5
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 169(sp)
; ZVFHMIN32-NEXT: lh a0, 720(sp)
; ZVFHMIN32-NEXT: lh a1, 464(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a2, v27
; ZVFHMIN32-NEXT: fmv.h.x fa5, s5
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s6
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN32-NEXT: sb a0, 168(sp)
; ZVFHMIN32-NEXT: lh a0, 718(sp)
; ZVFHMIN32-NEXT: lh a1, 462(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa3, s7
-; ZVFHMIN32-NEXT: fmv.h.x fa2, s8
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa0, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, ra
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, s6
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 167(sp)
; ZVFHMIN32-NEXT: lh a0, 716(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa0, a2
; ZVFHMIN32-NEXT: lh a1, 460(sp)
-; ZVFHMIN32-NEXT: feq.h s5, fa5, fa1
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: feq.h a0, fa4, fa0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s4
-; ZVFHMIN32-NEXT: sb a1, 166(sp)
-; ZVFHMIN32-NEXT: lh a1, 714(sp)
-; ZVFHMIN32-NEXT: lh a2, 458(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a3, fa3, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s3
-; ZVFHMIN32-NEXT: sb a1, 165(sp)
-; ZVFHMIN32-NEXT: lh a1, 712(sp)
-; ZVFHMIN32-NEXT: lh a2, 456(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT: feq.h a4, fa2, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa3, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, s2
-; ZVFHMIN32-NEXT: sb a1, 164(sp)
-; ZVFHMIN32-NEXT: lh a1, 710(sp)
-; ZVFHMIN32-NEXT: lh a2, 454(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa2, s9
-; ZVFHMIN32-NEXT: feq.h s2, fa5, fa2
+; ZVFHMIN32-NEXT: fmv.h.x fa2, s3
+; ZVFHMIN32-NEXT: fmv.h.x fa1, s7
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN32-NEXT: fmv.h.x ft0, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0
+; ZVFHMIN32-NEXT: sb a0, 166(sp)
+; ZVFHMIN32-NEXT: lh a0, 714(sp)
+; ZVFHMIN32-NEXT: lh a1, 458(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa0, s4
+; ZVFHMIN32-NEXT: fmv.h.x ft0, s8
+; ZVFHMIN32-NEXT: fmv.h.x ft1, a0
+; ZVFHMIN32-NEXT: fmv.h.x ft2, a1
+; ZVFHMIN32-NEXT: feq.h a0, ft1, ft2
+; ZVFHMIN32-NEXT: sb a0, 165(sp)
+; ZVFHMIN32-NEXT: lh a0, 712(sp)
+; ZVFHMIN32-NEXT: lh a1, 456(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft1, s10
+; ZVFHMIN32-NEXT: fmv.h.x ft2, s11
+; ZVFHMIN32-NEXT: fmv.h.x ft3, a0
+; ZVFHMIN32-NEXT: fmv.h.x ft4, a1
+; ZVFHMIN32-NEXT: feq.h a0, ft3, ft4
+; ZVFHMIN32-NEXT: sb a0, 164(sp)
+; ZVFHMIN32-NEXT: lh a0, 710(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft3, a4
+; ZVFHMIN32-NEXT: lh a1, 454(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft4, ra
+; ZVFHMIN32-NEXT: fmv.h.x ft5, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s10
-; ZVFHMIN32-NEXT: fmv.h.x fa2, s11
+; ZVFHMIN32-NEXT: feq.h a1, ft5, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
; ZVFHMIN32-NEXT: sb a1, 163(sp)
; ZVFHMIN32-NEXT: lh a1, 708(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft1, a2
; ZVFHMIN32-NEXT: lh a2, 452(sp)
-; ZVFHMIN32-NEXT: feq.h s3, fa4, fa5
-; ZVFHMIN32-NEXT: feq.h s4, fa3, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 162(sp)
-; ZVFHMIN32-NEXT: lh a1, 706(sp)
-; ZVFHMIN32-NEXT: lh a2, 450(sp)
-; ZVFHMIN32-NEXT: sb s4, 129(sp)
-; ZVFHMIN32-NEXT: sb s3, 130(sp)
-; ZVFHMIN32-NEXT: sb s2, 131(sp)
-; ZVFHMIN32-NEXT: sb a4, 132(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa0, fa5
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: feq.h a1, ft0, ft1
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a2
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa0
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT: sb a2, 162(sp)
+; ZVFHMIN32-NEXT: lh a2, 706(sp)
+; ZVFHMIN32-NEXT: lh a4, 450(sp)
+; ZVFHMIN32-NEXT: sb a1, 129(sp)
+; ZVFHMIN32-NEXT: feq.h a1, fa1, fa5
+; ZVFHMIN32-NEXT: sb a3, 130(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa2, ft4
+; ZVFHMIN32-NEXT: sb a1, 131(sp)
+; ZVFHMIN32-NEXT: feq.h a1, fa4, ft2
+; ZVFHMIN32-NEXT: sb a3, 132(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa3, ft3
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: sb a3, 133(sp)
-; ZVFHMIN32-NEXT: sb a0, 134(sp)
-; ZVFHMIN32-NEXT: sb s5, 135(sp)
-; ZVFHMIN32-NEXT: sb a1, 161(sp)
+; ZVFHMIN32-NEXT: sb a1, 134(sp)
+; ZVFHMIN32-NEXT: sb a0, 135(sp)
+; ZVFHMIN32-NEXT: sb a2, 161(sp)
; ZVFHMIN32-NEXT: lh a0, 610(sp)
; ZVFHMIN32-NEXT: lh a1, 354(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s6, v5
-; ZVFHMIN32-NEXT: vmv.x.s s5, v23
+; ZVFHMIN32-NEXT: vmv.x.s s4, v23
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 10
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1747,13 +1748,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 608(sp)
; ZVFHMIN32-NEXT: lh a1, 352(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 21
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a2, a2, 4
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 20
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a3, a2, 4
+; ZVFHMIN32-NEXT: sub a2, a3, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
@@ -1762,153 +1762,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 240(sp)
; ZVFHMIN32-NEXT: lh a0, 606(sp)
; ZVFHMIN32-NEXT: lh a1, 350(sp)
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 22
-; ZVFHMIN32-NEXT: mul a2, a2, a3
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT: vmv.x.s s6, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 239(sp)
; ZVFHMIN32-NEXT: lh a0, 604(sp)
; ZVFHMIN32-NEXT: lh a1, 348(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT: vmv.x.s s7, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 238(sp)
; ZVFHMIN32-NEXT: lh a0, 602(sp)
; ZVFHMIN32-NEXT: lh a1, 346(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT: vmv.x.s s8, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 237(sp)
; ZVFHMIN32-NEXT: lh a0, 600(sp)
; ZVFHMIN32-NEXT: lh a1, 344(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a3, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT: vmv.x.s s9, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 236(sp)
; ZVFHMIN32-NEXT: lh a0, 598(sp)
; ZVFHMIN32-NEXT: lh a1, 342(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a4, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT: vmv.x.s s10, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 235(sp)
; ZVFHMIN32-NEXT: lh a0, 596(sp)
; ZVFHMIN32-NEXT: lh a1, 340(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s8, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT: vmv.x.s s11, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 234(sp)
; ZVFHMIN32-NEXT: lh a0, 594(sp)
; ZVFHMIN32-NEXT: lh a1, 338(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s9, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT: vmv.x.s ra, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 233(sp)
; ZVFHMIN32-NEXT: lh a0, 592(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a1, v8
-; ZVFHMIN32-NEXT: lh t5, 336(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT: lh a1, 336(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: vmv.x.s s7, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa2, t5
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a2
; ZVFHMIN32-NEXT: sb a0, 232(sp)
; ZVFHMIN32-NEXT: lh a0, 590(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a3
-; ZVFHMIN32-NEXT: lh a2, 334(sp)
-; ZVFHMIN32-NEXT: feq.h t5, fa5, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: feq.h t6, fa4, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s6
+; ZVFHMIN32-NEXT: lh a1, 334(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT: fmv.h.x fa2, s4
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0
; ZVFHMIN32-NEXT: sb a0, 231(sp)
; ZVFHMIN32-NEXT: lh a0, 588(sp)
-; ZVFHMIN32-NEXT: lh a2, 332(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT: lh a1, 332(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa1, s2
+; ZVFHMIN32-NEXT: fmv.h.x fa0, s5
+; ZVFHMIN32-NEXT: fmv.h.x ft0, a0
+; ZVFHMIN32-NEXT: fmv.h.x ft1, a1
+; ZVFHMIN32-NEXT: feq.h a0, ft0, ft1
; ZVFHMIN32-NEXT: sb a0, 230(sp)
; ZVFHMIN32-NEXT: lh a0, 586(sp)
-; ZVFHMIN32-NEXT: lh a2, 330(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s8
-; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s4
-; ZVFHMIN32-NEXT: sb a0, 229(sp)
-; ZVFHMIN32-NEXT: lh a0, 584(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft0, s3
+; ZVFHMIN32-NEXT: lh a1, 330(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft1, s6
+; ZVFHMIN32-NEXT: fmv.h.x ft2, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: feq.h a1, ft2, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s7
+; ZVFHMIN32-NEXT: sb a1, 229(sp)
+; ZVFHMIN32-NEXT: lh a1, 584(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft1, s8
; ZVFHMIN32-NEXT: lh a2, 328(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s9
-; ZVFHMIN32-NEXT: feq.h s4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT: sb a0, 228(sp)
-; ZVFHMIN32-NEXT: lh a0, 582(sp)
-; ZVFHMIN32-NEXT: lh a2, 326(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT: sb a0, 227(sp)
-; ZVFHMIN32-NEXT: lh a0, 580(sp)
-; ZVFHMIN32-NEXT: lh a2, 324(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s7
-; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 226(sp)
-; ZVFHMIN32-NEXT: lh a0, 578(sp)
-; ZVFHMIN32-NEXT: lh a2, 322(sp)
-; ZVFHMIN32-NEXT: sb s2, 193(sp)
-; ZVFHMIN32-NEXT: sb a1, 194(sp)
-; ZVFHMIN32-NEXT: sb s4, 195(sp)
-; ZVFHMIN32-NEXT: sb a4, 196(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa3, ft1
; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a3, 197(sp)
-; ZVFHMIN32-NEXT: sb t6, 198(sp)
-; ZVFHMIN32-NEXT: sb t5, 199(sp)
-; ZVFHMIN32-NEXT: sb a0, 225(sp)
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT: sb a2, 228(sp)
+; ZVFHMIN32-NEXT: lh a2, 582(sp)
+; ZVFHMIN32-NEXT: lh a4, 326(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT: feq.h t4, fa2, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s11
+; ZVFHMIN32-NEXT: fmv.h.x fa3, ra
+; ZVFHMIN32-NEXT: sb a2, 227(sp)
+; ZVFHMIN32-NEXT: lh a2, 580(sp)
+; ZVFHMIN32-NEXT: lh a4, 324(sp)
+; ZVFHMIN32-NEXT: feq.h t5, fa0, fa5
+; ZVFHMIN32-NEXT: feq.h t6, ft0, fa3
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT: sb a2, 226(sp)
+; ZVFHMIN32-NEXT: lh a2, 578(sp)
+; ZVFHMIN32-NEXT: lh a4, 322(sp)
+; ZVFHMIN32-NEXT: sb t6, 193(sp)
+; ZVFHMIN32-NEXT: feq.h t6, fa1, fa4
+; ZVFHMIN32-NEXT: sb t5, 194(sp)
+; ZVFHMIN32-NEXT: sb t6, 195(sp)
+; ZVFHMIN32-NEXT: sb t4, 196(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 197(sp)
+; ZVFHMIN32-NEXT: sb a3, 198(sp)
+; ZVFHMIN32-NEXT: sb a0, 199(sp)
+; ZVFHMIN32-NEXT: sb a2, 225(sp)
; ZVFHMIN32-NEXT: lh a0, 766(sp)
; ZVFHMIN32-NEXT: lh a1, 510(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 18
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a3, a2, 4
+; ZVFHMIN32-NEXT: add a2, a3, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s s2, v8
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 14
+; ZVFHMIN32-NEXT: li a3, 11
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
@@ -1920,165 +1915,171 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 191(sp)
; ZVFHMIN32-NEXT: lh a0, 764(sp)
; ZVFHMIN32-NEXT: lh a1, 508(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t5, v6
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 2
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: vmv.x.s t5, v4
+; ZVFHMIN32-NEXT: vmv.x.s t4, v30
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 190(sp)
; ZVFHMIN32-NEXT: lh a0, 762(sp)
; ZVFHMIN32-NEXT: lh a1, 506(sp)
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: slli a2, a2, 2
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: addi a2, a2, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
; ZVFHMIN32-NEXT: csrr a3, vlenb
-; ZVFHMIN32-NEXT: slli a3, a3, 3
+; ZVFHMIN32-NEXT: slli a3, a3, 1
; ZVFHMIN32-NEXT: add a3, sp, a3
; ZVFHMIN32-NEXT: addi a3, a3, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a3, v8
-; ZVFHMIN32-NEXT: csrr a4, vlenb
-; ZVFHMIN32-NEXT: li s3, 6
-; ZVFHMIN32-NEXT: mul a4, a4, s3
-; ZVFHMIN32-NEXT: add a4, sp, a4
-; ZVFHMIN32-NEXT: addi a4, a4, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s a4, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 189(sp)
; ZVFHMIN32-NEXT: lh a0, 760(sp)
; ZVFHMIN32-NEXT: lh a1, 504(sp)
-; ZVFHMIN32-NEXT: csrr s3, vlenb
-; ZVFHMIN32-NEXT: li s4, 12
-; ZVFHMIN32-NEXT: mul s3, s3, s4
-; ZVFHMIN32-NEXT: add s3, sp, s3
-; ZVFHMIN32-NEXT: addi s3, s3, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s s6, v8
-; ZVFHMIN32-NEXT: csrr s3, vlenb
-; ZVFHMIN32-NEXT: li s4, 10
-; ZVFHMIN32-NEXT: mul s3, s3, s4
-; ZVFHMIN32-NEXT: add s3, sp, s3
-; ZVFHMIN32-NEXT: addi s3, s3, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s s4, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT: csrr a4, vlenb
+; ZVFHMIN32-NEXT: li t3, 6
+; ZVFHMIN32-NEXT: mul a4, a4, t3
+; ZVFHMIN32-NEXT: add a4, sp, a4
+; ZVFHMIN32-NEXT: addi a4, a4, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s a4, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN32-NEXT: sb a0, 188(sp)
; ZVFHMIN32-NEXT: lh a0, 758(sp)
; ZVFHMIN32-NEXT: lh a1, 502(sp)
-; ZVFHMIN32-NEXT: csrr s3, vlenb
-; ZVFHMIN32-NEXT: slli s3, s3, 4
-; ZVFHMIN32-NEXT: add s3, sp, s3
-; ZVFHMIN32-NEXT: addi s3, s3, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s s5, v8
-; ZVFHMIN32-NEXT: vmv.x.s s3, v16
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: csrr t2, vlenb
+; ZVFHMIN32-NEXT: slli t2, t2, 3
+; ZVFHMIN32-NEXT: add t2, sp, t2
+; ZVFHMIN32-NEXT: addi t2, t2, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t2, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
; ZVFHMIN32-NEXT: sb a0, 187(sp)
; ZVFHMIN32-NEXT: lh a0, 756(sp)
; ZVFHMIN32-NEXT: lh a1, 500(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h t4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT: fmv.h.x fa3, t1
+; ZVFHMIN32-NEXT: csrr t1, vlenb
+; ZVFHMIN32-NEXT: li t3, 13
+; ZVFHMIN32-NEXT: mul t1, t1, t3
+; ZVFHMIN32-NEXT: add t1, sp, t1
+; ZVFHMIN32-NEXT: addi t1, t1, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 186(sp)
; ZVFHMIN32-NEXT: lh a0, 754(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa2, t0
; ZVFHMIN32-NEXT: lh a1, 498(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t1
-; ZVFHMIN32-NEXT: sb a0, 185(sp)
-; ZVFHMIN32-NEXT: lh a0, 752(sp)
-; ZVFHMIN32-NEXT: lh a1, 496(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT: sb a0, 184(sp)
-; ZVFHMIN32-NEXT: lh a0, 750(sp)
-; ZVFHMIN32-NEXT: lh a1, 494(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s6
+; ZVFHMIN32-NEXT: csrr t0, vlenb
+; ZVFHMIN32-NEXT: li t1, 19
+; ZVFHMIN32-NEXT: mul t0, t0, t1
+; ZVFHMIN32-NEXT: add t0, sp, t0
+; ZVFHMIN32-NEXT: addi t0, t0, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s s3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT: csrr a0, vlenb
+; ZVFHMIN32-NEXT: li t0, 21
+; ZVFHMIN32-NEXT: mul a0, a0, t0
+; ZVFHMIN32-NEXT: add a0, sp, a0
+; ZVFHMIN32-NEXT: addi a0, a0, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s a0, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa1, fa0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a2
+; ZVFHMIN32-NEXT: sb a1, 185(sp)
+; ZVFHMIN32-NEXT: lh a1, 752(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a3
+; ZVFHMIN32-NEXT: lh a2, 496(sp)
+; ZVFHMIN32-NEXT: feq.h t0, fa5, fa1
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: feq.h t1, fa4, fa0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: sb a1, 184(sp)
+; ZVFHMIN32-NEXT: lh a1, 750(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: lh a2, 494(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa3, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa2, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT: sb a0, 183(sp)
-; ZVFHMIN32-NEXT: lh a0, 748(sp)
-; ZVFHMIN32-NEXT: lh a1, 492(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s4
-; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT: sb a0, 182(sp)
-; ZVFHMIN32-NEXT: lh a0, 746(sp)
-; ZVFHMIN32-NEXT: lh a1, 490(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s5
-; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a2, 183(sp)
+; ZVFHMIN32-NEXT: lh a2, 748(sp)
+; ZVFHMIN32-NEXT: lh a4, 492(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT: sb a0, 181(sp)
-; ZVFHMIN32-NEXT: lh a0, 744(sp)
-; ZVFHMIN32-NEXT: lh a1, 488(sp)
+; ZVFHMIN32-NEXT: sb a2, 182(sp)
+; ZVFHMIN32-NEXT: lh a2, 746(sp)
+; ZVFHMIN32-NEXT: lh a4, 490(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, s3
; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT: addi a1, sp, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s a1, v8
+; ZVFHMIN32-NEXT: sb a2, 181(sp)
+; ZVFHMIN32-NEXT: lh a2, 744(sp)
+; ZVFHMIN32-NEXT: lh a4, 488(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: lw a4, 108(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: vmv.x.s a5, v0
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT: vmv.x.s a5, v8
-; ZVFHMIN32-NEXT: sb a0, 180(sp)
-; ZVFHMIN32-NEXT: lh a0, 742(sp)
-; ZVFHMIN32-NEXT: lh a7, 486(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 179(sp)
-; ZVFHMIN32-NEXT: lh a0, 740(sp)
-; ZVFHMIN32-NEXT: lh a7, 484(sp)
-; ZVFHMIN32-NEXT: sb a2, 140(sp)
-; ZVFHMIN32-NEXT: sb t1, 141(sp)
-; ZVFHMIN32-NEXT: sb t3, 142(sp)
-; ZVFHMIN32-NEXT: sb t4, 143(sp)
-; ZVFHMIN32-NEXT: sb a1, 136(sp)
-; ZVFHMIN32-NEXT: sb a6, 137(sp)
-; ZVFHMIN32-NEXT: sb a4, 138(sp)
-; ZVFHMIN32-NEXT: sb a3, 139(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: vmv.x.s a4, v8
+; ZVFHMIN32-NEXT: sb a2, 180(sp)
+; ZVFHMIN32-NEXT: lh a2, 742(sp)
+; ZVFHMIN32-NEXT: lh t2, 486(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a2, 179(sp)
+; ZVFHMIN32-NEXT: lh a2, 740(sp)
+; ZVFHMIN32-NEXT: lh t2, 484(sp)
+; ZVFHMIN32-NEXT: sb a1, 140(sp)
+; ZVFHMIN32-NEXT: sb a3, 141(sp)
+; ZVFHMIN32-NEXT: sb t1, 142(sp)
+; ZVFHMIN32-NEXT: sb t0, 143(sp)
+; ZVFHMIN32-NEXT: sb a5, 136(sp)
+; ZVFHMIN32-NEXT: sb a0, 137(sp)
+; ZVFHMIN32-NEXT: sb a6, 138(sp)
+; ZVFHMIN32-NEXT: sb a7, 139(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 178(sp)
; ZVFHMIN32-NEXT: lh a0, 638(sp)
; ZVFHMIN32-NEXT: lh a1, 382(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14
-; ZVFHMIN32-NEXT: vmv.x.s t3, v8
+; ZVFHMIN32-NEXT: vmv.x.s t2, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -2086,7 +2087,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 636(sp)
; ZVFHMIN32-NEXT: lh a1, 380(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13
-; ZVFHMIN32-NEXT: vmv.x.s t2, v8
+; ZVFHMIN32-NEXT: vmv.x.s t1, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -2094,7 +2095,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 634(sp)
; ZVFHMIN32-NEXT: lh a1, 378(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12
-; ZVFHMIN32-NEXT: vmv.x.s t1, v8
+; ZVFHMIN32-NEXT: vmv.x.s t0, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -2102,7 +2103,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 632(sp)
; ZVFHMIN32-NEXT: lh a1, 376(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11
-; ZVFHMIN32-NEXT: vmv.x.s t0, v8
+; ZVFHMIN32-NEXT: vmv.x.s a7, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -2110,7 +2111,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 630(sp)
; ZVFHMIN32-NEXT: lh a1, 374(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10
-; ZVFHMIN32-NEXT: vmv.x.s a7, v8
+; ZVFHMIN32-NEXT: vmv.x.s a6, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -2118,102 +2119,101 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 628(sp)
; ZVFHMIN32-NEXT: lh a1, 372(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9
-; ZVFHMIN32-NEXT: vmv.x.s a6, v8
+; ZVFHMIN32-NEXT: vmv.x.s a5, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: lw a1, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
; ZVFHMIN32-NEXT: sb a0, 250(sp)
; ZVFHMIN32-NEXT: lh a0, 626(sp)
; ZVFHMIN32-NEXT: lh a1, 370(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 249(sp)
-; ZVFHMIN32-NEXT: lh a0, 624(sp)
-; ZVFHMIN32-NEXT: lh a1, 368(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 248(sp)
-; ZVFHMIN32-NEXT: lh a0, 622(sp)
-; ZVFHMIN32-NEXT: lh a1, 366(sp)
+; ZVFHMIN32-NEXT: sb a0, 249(sp)
+; ZVFHMIN32-NEXT: lh a1, 624(sp)
+; ZVFHMIN32-NEXT: lh a3, 368(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: lw a1, 108(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 247(sp)
-; ZVFHMIN32-NEXT: lh a0, 620(sp)
-; ZVFHMIN32-NEXT: lh a1, 364(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: lw a3, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: sb a1, 248(sp)
+; ZVFHMIN32-NEXT: lh a1, 622(sp)
+; ZVFHMIN32-NEXT: lh a3, 366(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 246(sp)
-; ZVFHMIN32-NEXT: lh a0, 618(sp)
-; ZVFHMIN32-NEXT: lh a1, 362(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: lw a3, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: sb a1, 247(sp)
+; ZVFHMIN32-NEXT: lh a1, 620(sp)
+; ZVFHMIN32-NEXT: lh a3, 364(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, t0
; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT: sb a0, 245(sp)
-; ZVFHMIN32-NEXT: lh a0, 616(sp)
-; ZVFHMIN32-NEXT: lh a1, 360(sp)
+; ZVFHMIN32-NEXT: sb a1, 246(sp)
+; ZVFHMIN32-NEXT: lh a1, 618(sp)
+; ZVFHMIN32-NEXT: lh a3, 362(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT: sb a0, 244(sp)
-; ZVFHMIN32-NEXT: lh a0, 614(sp)
-; ZVFHMIN32-NEXT: lh a1, 358(sp)
+; ZVFHMIN32-NEXT: sb a1, 245(sp)
+; ZVFHMIN32-NEXT: lh a1, 616(sp)
+; ZVFHMIN32-NEXT: lh a3, 360(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, a6
; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT: sb a1, 244(sp)
+; ZVFHMIN32-NEXT: lh a1, 614(sp)
+; ZVFHMIN32-NEXT: lh a3, 358(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8
-; ZVFHMIN32-NEXT: vmv.x.s a1, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: sb a0, 243(sp)
-; ZVFHMIN32-NEXT: lh a0, 612(sp)
-; ZVFHMIN32-NEXT: lh a1, 356(sp)
-; ZVFHMIN32-NEXT: sb a5, 204(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: sb a1, 243(sp)
+; ZVFHMIN32-NEXT: lh a1, 612(sp)
+; ZVFHMIN32-NEXT: lh a3, 356(sp)
+; ZVFHMIN32-NEXT: sb t0, 204(sp)
; ZVFHMIN32-NEXT: sb a4, 205(sp)
-; ZVFHMIN32-NEXT: sb a2, 206(sp)
-; ZVFHMIN32-NEXT: sb a3, 207(sp)
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: sb a2, 200(sp)
-; ZVFHMIN32-NEXT: sb a6, 201(sp)
-; ZVFHMIN32-NEXT: sb a7, 202(sp)
-; ZVFHMIN32-NEXT: sb t0, 203(sp)
-; ZVFHMIN32-NEXT: li a2, 128
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 242(sp)
-; ZVFHMIN32-NEXT: addi a0, sp, 128
-; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN32-NEXT: vle8.v v8, (a0)
+; ZVFHMIN32-NEXT: sb a0, 206(sp)
+; ZVFHMIN32-NEXT: sb a2, 207(sp)
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 200(sp)
+; ZVFHMIN32-NEXT: sb a5, 201(sp)
+; ZVFHMIN32-NEXT: sb a6, 202(sp)
+; ZVFHMIN32-NEXT: sb a7, 203(sp)
+; ZVFHMIN32-NEXT: li a0, 128
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 242(sp)
+; ZVFHMIN32-NEXT: addi a1, sp, 128
+; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN32-NEXT: vle8.v v8, (a1)
; ZVFHMIN32-NEXT: vand.vi v8, v8, 1
; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN32-NEXT: addi sp, s0, -896
@@ -2498,19 +2498,16 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13
-; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 1
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: addi a2, a2, 800
+; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 13
+; ZVFHMIN64-NEXT: addi a2, sp, 800
; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11
-; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10
-; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9
-; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8
-; ZVFHMIN64-NEXT: vmv.x.s a4, v16
+; ZVFHMIN64-NEXT: vslidedown.vi v6, v8, 12
+; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 11
+; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 10
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 9
+; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 8
+; ZVFHMIN64-NEXT: vmv.x.s a3, v16
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2518,52 +2515,51 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 560(sp)
; ZVFHMIN64-NEXT: lh a1, 304(sp)
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7
-; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6
-; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5
+; ZVFHMIN64-NEXT: vslidedown.vi v21, v16, 7
+; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 6
+; ZVFHMIN64-NEXT: vslidedown.vi v19, v16, 5
; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 21
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 10
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 20
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a2, a2, 4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 22
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a4, a2, 4
+; ZVFHMIN64-NEXT: sub a2, a4, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15
-; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 13
+; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 15
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 13
; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12
-; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 11
-; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10
+; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 11
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 18
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a4, a2, 4
+; ZVFHMIN64-NEXT: add a2, a4, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9
+; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 10
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 14
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 11
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8
+; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT: vslidedown.vi v4, v16, 9
+; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2571,12 +2567,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 558(sp)
; ZVFHMIN64-NEXT: lh a1, 302(sp)
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 7
-; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 6
-; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 5
-; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 4
-; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 3
-; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 2
+; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 7
+; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 6
+; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 5
+; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4
+; ZVFHMIN64-NEXT: vslidedown.vi v31, v0, 3
+; ZVFHMIN64-NEXT: vslidedown.vi v5, v0, 2
; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15
@@ -2587,63 +2583,63 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 3
+; ZVFHMIN64-NEXT: slli a2, a2, 1
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 6
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 6
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 12
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a2, a2, 3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 10
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 13
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 4
+; ZVFHMIN64-NEXT: li a4, 19
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a4, 21
+; ZVFHMIN64-NEXT: mul a2, a2, a4
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: addi a2, a2, 800
+; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8
-; ZVFHMIN64-NEXT: addi a2, sp, 800
-; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vmv.x.s t4, v26
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 215(sp)
; ZVFHMIN64-NEXT: lh a0, 556(sp)
; ZVFHMIN64-NEXT: lh a1, 300(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t3, v20
-; ZVFHMIN64-NEXT: vmv.x.s t1, v28
+; ZVFHMIN64-NEXT: vmv.x.s t3, v26
+; ZVFHMIN64-NEXT: vmv.x.s t2, v28
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 214(sp)
; ZVFHMIN64-NEXT: lh a0, 554(sp)
; ZVFHMIN64-NEXT: lh a1, 298(sp)
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 1
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t2, v0
-; ZVFHMIN64-NEXT: vmv.x.s t0, v4
+; ZVFHMIN64-NEXT: addi a2, sp, 800
+; ZVFHMIN64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t1, v16
+; ZVFHMIN64-NEXT: vmv.x.s t0, v6
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2651,229 +2647,234 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 552(sp)
; ZVFHMIN64-NEXT: lh a1, 296(sp)
; ZVFHMIN64-NEXT: vmv.x.s a7, v2
-; ZVFHMIN64-NEXT: vmv.x.s a6, v30
+; ZVFHMIN64-NEXT: vmv.x.s a6, v22
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 212(sp)
; ZVFHMIN64-NEXT: lh a0, 550(sp)
; ZVFHMIN64-NEXT: lh a1, 294(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a5, v22
+; ZVFHMIN64-NEXT: vmv.x.s a5, v20
; ZVFHMIN64-NEXT: vmv.x.s a2, v18
-; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 211(sp)
-; ZVFHMIN64-NEXT: lh a1, 548(sp)
-; ZVFHMIN64-NEXT: lh t5, 292(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a0, v14
-; ZVFHMIN64-NEXT: sd a0, 104(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: vmv.x.s a0, v8
-; ZVFHMIN64-NEXT: sd a0, 120(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 210(sp)
-; ZVFHMIN64-NEXT: lh a1, 546(sp)
-; ZVFHMIN64-NEXT: lh t5, 290(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT: vmv.x.s a4, v24
+; ZVFHMIN64-NEXT: lh a0, 548(sp)
+; ZVFHMIN64-NEXT: lh a1, 292(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a2, v14
+; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
+; ZVFHMIN64-NEXT: sd a2, 120(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa3, t5
-; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3
-; ZVFHMIN64-NEXT: sb a1, 209(sp)
-; ZVFHMIN64-NEXT: lh a1, 544(sp)
-; ZVFHMIN64-NEXT: lh t5, 288(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a4, 192(sp)
-; ZVFHMIN64-NEXT: sb a1, 208(sp)
-; ZVFHMIN64-NEXT: lh t5, 738(sp)
-; ZVFHMIN64-NEXT: lh t6, 482(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a0, v12
-; ZVFHMIN64-NEXT: sd a0, 88(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: vmv.x.s a0, v10
-; ZVFHMIN64-NEXT: sd a0, 112(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT: sb t5, 177(sp)
-; ZVFHMIN64-NEXT: lh t5, 736(sp)
-; ZVFHMIN64-NEXT: lh t6, 480(sp)
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li a1, 29
-; ZVFHMIN64-NEXT: mul a0, a0, a1
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: lh s5, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li a1, 28
-; ZVFHMIN64-NEXT: mul a0, a0, a1
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: lh s6, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT: sb t5, 176(sp)
-; ZVFHMIN64-NEXT: lh t5, 734(sp)
-; ZVFHMIN64-NEXT: lh t6, 478(sp)
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li a1, 27
-; ZVFHMIN64-NEXT: mul a0, a0, a1
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: lh s7, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li a1, 26
-; ZVFHMIN64-NEXT: mul a0, a0, a1
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: lh s8, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT: sb t5, 175(sp)
-; ZVFHMIN64-NEXT: lh t5, 732(sp)
-; ZVFHMIN64-NEXT: lh t6, 476(sp)
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li a1, 25
-; ZVFHMIN64-NEXT: mul a0, a0, a1
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: lh s4, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li a1, 24
-; ZVFHMIN64-NEXT: mul a0, a0, a1
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: lh s3, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT: sb t5, 174(sp)
-; ZVFHMIN64-NEXT: lh t6, 730(sp)
-; ZVFHMIN64-NEXT: lh s9, 474(sp)
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li a1, 23
-; ZVFHMIN64-NEXT: mul a0, a0, a1
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: lh s2, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t5, v3
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 210(sp)
+; ZVFHMIN64-NEXT: lh a0, 546(sp)
+; ZVFHMIN64-NEXT: lh a1, 290(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: vmv.x.s a3, v24
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT: sb a0, 209(sp)
+; ZVFHMIN64-NEXT: lh a0, 544(sp)
+; ZVFHMIN64-NEXT: lh a1, 288(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a3, 192(sp)
+; ZVFHMIN64-NEXT: sb a0, 208(sp)
+; ZVFHMIN64-NEXT: lh a0, 738(sp)
+; ZVFHMIN64-NEXT: lh a1, 482(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a2, v10
+; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v12
+; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 177(sp)
+; ZVFHMIN64-NEXT: lh a0, 736(sp)
+; ZVFHMIN64-NEXT: lh a1, 480(sp)
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 29
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 28
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 176(sp)
+; ZVFHMIN64-NEXT: lh a0, 734(sp)
+; ZVFHMIN64-NEXT: lh a1, 478(sp)
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 27
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 26
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 175(sp)
+; ZVFHMIN64-NEXT: lh a0, 732(sp)
+; ZVFHMIN64-NEXT: lh a1, 476(sp)
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 25
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 24
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 174(sp)
+; ZVFHMIN64-NEXT: lh a0, 730(sp)
+; ZVFHMIN64-NEXT: lh a1, 474(sp)
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 23
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t4, v21
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 173(sp)
+; ZVFHMIN64-NEXT: lh a0, 728(sp)
+; ZVFHMIN64-NEXT: lh a1, 472(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t6, v3
+; ZVFHMIN64-NEXT: vmv.x.s t5, v19
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 172(sp)
+; ZVFHMIN64-NEXT: lh a0, 726(sp)
+; ZVFHMIN64-NEXT: lh a1, 470(sp)
+; ZVFHMIN64-NEXT: vmv.x.s s10, v11
+; ZVFHMIN64-NEXT: vmv.x.s s11, v7
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 171(sp)
+; ZVFHMIN64-NEXT: lh a0, 724(sp)
+; ZVFHMIN64-NEXT: lh s9, 468(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a4, v9
+; ZVFHMIN64-NEXT: vmv.x.s ra, v29
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, s9
-; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4
-; ZVFHMIN64-NEXT: sb t6, 173(sp)
-; ZVFHMIN64-NEXT: lh s9, 728(sp)
-; ZVFHMIN64-NEXT: lh s10, 472(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t6, v31
-; ZVFHMIN64-NEXT: vmv.x.s ra, v13
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4
-; ZVFHMIN64-NEXT: sb s9, 172(sp)
-; ZVFHMIN64-NEXT: lh s9, 726(sp)
-; ZVFHMIN64-NEXT: lh s10, 470(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v29
-; ZVFHMIN64-NEXT: vmv.x.s a3, v11
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4
-; ZVFHMIN64-NEXT: sb s9, 171(sp)
-; ZVFHMIN64-NEXT: lh s10, 724(sp)
-; ZVFHMIN64-NEXT: lh s11, 468(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a4, v7
-; ZVFHMIN64-NEXT: vmv.x.s s9, v9
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s10
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s11
-; ZVFHMIN64-NEXT: feq.h s10, fa5, fa4
-; ZVFHMIN64-NEXT: sb s10, 170(sp)
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 170(sp)
; ZVFHMIN64-NEXT: lh a0, 722(sp)
; ZVFHMIN64-NEXT: lh a1, 466(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s10, v21
-; ZVFHMIN64-NEXT: vmv.x.s s11, v27
+; ZVFHMIN64-NEXT: vmv.x.s s9, v31
+; ZVFHMIN64-NEXT: vmv.x.s a3, v5
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 169(sp)
; ZVFHMIN64-NEXT: lh a0, 720(sp)
; ZVFHMIN64-NEXT: lh a1, 464(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a2, v27
; ZVFHMIN64-NEXT: fmv.h.x fa5, s5
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s6
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN64-NEXT: sb a0, 168(sp)
; ZVFHMIN64-NEXT: lh a0, 718(sp)
; ZVFHMIN64-NEXT: lh a1, 462(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa3, s7
-; ZVFHMIN64-NEXT: fmv.h.x fa2, s8
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa0, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, ra
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, s6
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 167(sp)
; ZVFHMIN64-NEXT: lh a0, 716(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa0, a2
; ZVFHMIN64-NEXT: lh a1, 460(sp)
-; ZVFHMIN64-NEXT: feq.h s5, fa5, fa1
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: feq.h a0, fa4, fa0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s4
-; ZVFHMIN64-NEXT: sb a1, 166(sp)
-; ZVFHMIN64-NEXT: lh a1, 714(sp)
-; ZVFHMIN64-NEXT: lh a2, 458(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a3, fa3, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s3
-; ZVFHMIN64-NEXT: sb a1, 165(sp)
-; ZVFHMIN64-NEXT: lh a1, 712(sp)
-; ZVFHMIN64-NEXT: lh a2, 456(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT: feq.h a4, fa2, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa3, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, s2
-; ZVFHMIN64-NEXT: sb a1, 164(sp)
-; ZVFHMIN64-NEXT: lh a1, 710(sp)
-; ZVFHMIN64-NEXT: lh a2, 454(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa2, s9
-; ZVFHMIN64-NEXT: feq.h s2, fa5, fa2
+; ZVFHMIN64-NEXT: fmv.h.x fa2, s3
+; ZVFHMIN64-NEXT: fmv.h.x fa1, s7
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN64-NEXT: fmv.h.x ft0, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa0, ft0
+; ZVFHMIN64-NEXT: sb a0, 166(sp)
+; ZVFHMIN64-NEXT: lh a0, 714(sp)
+; ZVFHMIN64-NEXT: lh a1, 458(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa0, s4
+; ZVFHMIN64-NEXT: fmv.h.x ft0, s8
+; ZVFHMIN64-NEXT: fmv.h.x ft1, a0
+; ZVFHMIN64-NEXT: fmv.h.x ft2, a1
+; ZVFHMIN64-NEXT: feq.h a0, ft1, ft2
+; ZVFHMIN64-NEXT: sb a0, 165(sp)
+; ZVFHMIN64-NEXT: lh a0, 712(sp)
+; ZVFHMIN64-NEXT: lh a1, 456(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft1, s10
+; ZVFHMIN64-NEXT: fmv.h.x ft2, s11
+; ZVFHMIN64-NEXT: fmv.h.x ft3, a0
+; ZVFHMIN64-NEXT: fmv.h.x ft4, a1
+; ZVFHMIN64-NEXT: feq.h a0, ft3, ft4
+; ZVFHMIN64-NEXT: sb a0, 164(sp)
+; ZVFHMIN64-NEXT: lh a0, 710(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft3, a4
+; ZVFHMIN64-NEXT: lh a1, 454(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft4, ra
+; ZVFHMIN64-NEXT: fmv.h.x ft5, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s10
-; ZVFHMIN64-NEXT: fmv.h.x fa2, s11
+; ZVFHMIN64-NEXT: feq.h a1, ft5, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
; ZVFHMIN64-NEXT: sb a1, 163(sp)
; ZVFHMIN64-NEXT: lh a1, 708(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft1, a2
; ZVFHMIN64-NEXT: lh a2, 452(sp)
-; ZVFHMIN64-NEXT: feq.h s3, fa4, fa5
-; ZVFHMIN64-NEXT: feq.h s4, fa3, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 162(sp)
-; ZVFHMIN64-NEXT: lh a1, 706(sp)
-; ZVFHMIN64-NEXT: lh a2, 450(sp)
-; ZVFHMIN64-NEXT: sb s4, 129(sp)
-; ZVFHMIN64-NEXT: sb s3, 130(sp)
-; ZVFHMIN64-NEXT: sb s2, 131(sp)
-; ZVFHMIN64-NEXT: sb a4, 132(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa0, fa5
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: feq.h a1, ft0, ft1
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a2
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa0
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT: sb a2, 162(sp)
+; ZVFHMIN64-NEXT: lh a2, 706(sp)
+; ZVFHMIN64-NEXT: lh a4, 450(sp)
+; ZVFHMIN64-NEXT: sb a1, 129(sp)
+; ZVFHMIN64-NEXT: feq.h a1, fa1, fa5
+; ZVFHMIN64-NEXT: sb a3, 130(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa2, ft4
+; ZVFHMIN64-NEXT: sb a1, 131(sp)
+; ZVFHMIN64-NEXT: feq.h a1, fa4, ft2
+; ZVFHMIN64-NEXT: sb a3, 132(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa3, ft3
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: sb a3, 133(sp)
-; ZVFHMIN64-NEXT: sb a0, 134(sp)
-; ZVFHMIN64-NEXT: sb s5, 135(sp)
-; ZVFHMIN64-NEXT: sb a1, 161(sp)
+; ZVFHMIN64-NEXT: sb a1, 134(sp)
+; ZVFHMIN64-NEXT: sb a0, 135(sp)
+; ZVFHMIN64-NEXT: sb a2, 161(sp)
; ZVFHMIN64-NEXT: lh a0, 610(sp)
; ZVFHMIN64-NEXT: lh a1, 354(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s6, v5
-; ZVFHMIN64-NEXT: vmv.x.s s5, v23
+; ZVFHMIN64-NEXT: vmv.x.s s4, v23
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 10
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2881,13 +2882,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 608(sp)
; ZVFHMIN64-NEXT: lh a1, 352(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 21
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a2, a2, 4
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 20
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a3, a2, 4
+; ZVFHMIN64-NEXT: sub a2, a3, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
@@ -2896,153 +2896,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 240(sp)
; ZVFHMIN64-NEXT: lh a0, 606(sp)
; ZVFHMIN64-NEXT: lh a1, 350(sp)
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 22
-; ZVFHMIN64-NEXT: mul a2, a2, a3
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT: vmv.x.s s6, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 239(sp)
; ZVFHMIN64-NEXT: lh a0, 604(sp)
; ZVFHMIN64-NEXT: lh a1, 348(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT: vmv.x.s s7, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 238(sp)
; ZVFHMIN64-NEXT: lh a0, 602(sp)
; ZVFHMIN64-NEXT: lh a1, 346(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT: vmv.x.s s8, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 237(sp)
; ZVFHMIN64-NEXT: lh a0, 600(sp)
; ZVFHMIN64-NEXT: lh a1, 344(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a3, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT: vmv.x.s s9, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 236(sp)
; ZVFHMIN64-NEXT: lh a0, 598(sp)
; ZVFHMIN64-NEXT: lh a1, 342(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a4, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT: vmv.x.s s10, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 235(sp)
; ZVFHMIN64-NEXT: lh a0, 596(sp)
; ZVFHMIN64-NEXT: lh a1, 340(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s8, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT: vmv.x.s s11, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 234(sp)
; ZVFHMIN64-NEXT: lh a0, 594(sp)
; ZVFHMIN64-NEXT: lh a1, 338(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s9, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT: vmv.x.s ra, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 233(sp)
; ZVFHMIN64-NEXT: lh a0, 592(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a1, v8
-; ZVFHMIN64-NEXT: lh t5, 336(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT: lh a1, 336(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: vmv.x.s s7, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa2, t5
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a2
; ZVFHMIN64-NEXT: sb a0, 232(sp)
; ZVFHMIN64-NEXT: lh a0, 590(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a3
-; ZVFHMIN64-NEXT: lh a2, 334(sp)
-; ZVFHMIN64-NEXT: feq.h t5, fa5, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: feq.h t6, fa4, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s6
+; ZVFHMIN64-NEXT: lh a1, 334(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT: fmv.h.x fa2, s4
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0
; ZVFHMIN64-NEXT: sb a0, 231(sp)
; ZVFHMIN64-NEXT: lh a0, 588(sp)
-; ZVFHMIN64-NEXT: lh a2, 332(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT: lh a1, 332(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa1, s2
+; ZVFHMIN64-NEXT: fmv.h.x fa0, s5
+; ZVFHMIN64-NEXT: fmv.h.x ft0, a0
+; ZVFHMIN64-NEXT: fmv.h.x ft1, a1
+; ZVFHMIN64-NEXT: feq.h a0, ft0, ft1
; ZVFHMIN64-NEXT: sb a0, 230(sp)
; ZVFHMIN64-NEXT: lh a0, 586(sp)
-; ZVFHMIN64-NEXT: lh a2, 330(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s8
-; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s4
-; ZVFHMIN64-NEXT: sb a0, 229(sp)
-; ZVFHMIN64-NEXT: lh a0, 584(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft0, s3
+; ZVFHMIN64-NEXT: lh a1, 330(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft1, s6
+; ZVFHMIN64-NEXT: fmv.h.x ft2, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: feq.h a1, ft2, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s7
+; ZVFHMIN64-NEXT: sb a1, 229(sp)
+; ZVFHMIN64-NEXT: lh a1, 584(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft1, s8
; ZVFHMIN64-NEXT: lh a2, 328(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s9
-; ZVFHMIN64-NEXT: feq.h s4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT: sb a0, 228(sp)
-; ZVFHMIN64-NEXT: lh a0, 582(sp)
-; ZVFHMIN64-NEXT: lh a2, 326(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT: sb a0, 227(sp)
-; ZVFHMIN64-NEXT: lh a0, 580(sp)
-; ZVFHMIN64-NEXT: lh a2, 324(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s7
-; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 226(sp)
-; ZVFHMIN64-NEXT: lh a0, 578(sp)
-; ZVFHMIN64-NEXT: lh a2, 322(sp)
-; ZVFHMIN64-NEXT: sb s2, 193(sp)
-; ZVFHMIN64-NEXT: sb a1, 194(sp)
-; ZVFHMIN64-NEXT: sb s4, 195(sp)
-; ZVFHMIN64-NEXT: sb a4, 196(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa3, ft1
; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a3, 197(sp)
-; ZVFHMIN64-NEXT: sb t6, 198(sp)
-; ZVFHMIN64-NEXT: sb t5, 199(sp)
-; ZVFHMIN64-NEXT: sb a0, 225(sp)
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT: sb a2, 228(sp)
+; ZVFHMIN64-NEXT: lh a2, 582(sp)
+; ZVFHMIN64-NEXT: lh a4, 326(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT: feq.h t4, fa2, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s11
+; ZVFHMIN64-NEXT: fmv.h.x fa3, ra
+; ZVFHMIN64-NEXT: sb a2, 227(sp)
+; ZVFHMIN64-NEXT: lh a2, 580(sp)
+; ZVFHMIN64-NEXT: lh a4, 324(sp)
+; ZVFHMIN64-NEXT: feq.h t5, fa0, fa5
+; ZVFHMIN64-NEXT: feq.h t6, ft0, fa3
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT: sb a2, 226(sp)
+; ZVFHMIN64-NEXT: lh a2, 578(sp)
+; ZVFHMIN64-NEXT: lh a4, 322(sp)
+; ZVFHMIN64-NEXT: sb t6, 193(sp)
+; ZVFHMIN64-NEXT: feq.h t6, fa1, fa4
+; ZVFHMIN64-NEXT: sb t5, 194(sp)
+; ZVFHMIN64-NEXT: sb t6, 195(sp)
+; ZVFHMIN64-NEXT: sb t4, 196(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 197(sp)
+; ZVFHMIN64-NEXT: sb a3, 198(sp)
+; ZVFHMIN64-NEXT: sb a0, 199(sp)
+; ZVFHMIN64-NEXT: sb a2, 225(sp)
; ZVFHMIN64-NEXT: lh a0, 766(sp)
; ZVFHMIN64-NEXT: lh a1, 510(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 18
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a3, a2, 4
+; ZVFHMIN64-NEXT: add a2, a3, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s s2, v8
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 14
+; ZVFHMIN64-NEXT: li a3, 11
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
@@ -3054,165 +3049,171 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 191(sp)
; ZVFHMIN64-NEXT: lh a0, 764(sp)
; ZVFHMIN64-NEXT: lh a1, 508(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t5, v6
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 2
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s a2, v8
+; ZVFHMIN64-NEXT: vmv.x.s t5, v4
+; ZVFHMIN64-NEXT: vmv.x.s t4, v30
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 190(sp)
; ZVFHMIN64-NEXT: lh a0, 762(sp)
; ZVFHMIN64-NEXT: lh a1, 506(sp)
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: slli a2, a2, 2
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: addi a2, a2, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: csrr a3, vlenb
-; ZVFHMIN64-NEXT: slli a3, a3, 3
+; ZVFHMIN64-NEXT: slli a3, a3, 1
; ZVFHMIN64-NEXT: add a3, sp, a3
; ZVFHMIN64-NEXT: addi a3, a3, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a3, v8
-; ZVFHMIN64-NEXT: csrr a4, vlenb
-; ZVFHMIN64-NEXT: li s3, 6
-; ZVFHMIN64-NEXT: mul a4, a4, s3
-; ZVFHMIN64-NEXT: add a4, sp, a4
-; ZVFHMIN64-NEXT: addi a4, a4, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s a4, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 189(sp)
; ZVFHMIN64-NEXT: lh a0, 760(sp)
; ZVFHMIN64-NEXT: lh a1, 504(sp)
-; ZVFHMIN64-NEXT: csrr s3, vlenb
-; ZVFHMIN64-NEXT: li s4, 12
-; ZVFHMIN64-NEXT: mul s3, s3, s4
-; ZVFHMIN64-NEXT: add s3, sp, s3
-; ZVFHMIN64-NEXT: addi s3, s3, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s s6, v8
-; ZVFHMIN64-NEXT: csrr s3, vlenb
-; ZVFHMIN64-NEXT: li s4, 10
-; ZVFHMIN64-NEXT: mul s3, s3, s4
-; ZVFHMIN64-NEXT: add s3, sp, s3
-; ZVFHMIN64-NEXT: addi s3, s3, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s s4, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT: csrr a4, vlenb
+; ZVFHMIN64-NEXT: li t3, 6
+; ZVFHMIN64-NEXT: mul a4, a4, t3
+; ZVFHMIN64-NEXT: add a4, sp, a4
+; ZVFHMIN64-NEXT: addi a4, a4, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s a4, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN64-NEXT: sb a0, 188(sp)
; ZVFHMIN64-NEXT: lh a0, 758(sp)
; ZVFHMIN64-NEXT: lh a1, 502(sp)
-; ZVFHMIN64-NEXT: csrr s3, vlenb
-; ZVFHMIN64-NEXT: slli s3, s3, 4
-; ZVFHMIN64-NEXT: add s3, sp, s3
-; ZVFHMIN64-NEXT: addi s3, s3, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s s5, v8
-; ZVFHMIN64-NEXT: vmv.x.s s3, v16
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: csrr t2, vlenb
+; ZVFHMIN64-NEXT: slli t2, t2, 3
+; ZVFHMIN64-NEXT: add t2, sp, t2
+; ZVFHMIN64-NEXT: addi t2, t2, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t2, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
; ZVFHMIN64-NEXT: sb a0, 187(sp)
; ZVFHMIN64-NEXT: lh a0, 756(sp)
; ZVFHMIN64-NEXT: lh a1, 500(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT: fmv.h.x fa3, t1
+; ZVFHMIN64-NEXT: csrr t1, vlenb
+; ZVFHMIN64-NEXT: li t3, 13
+; ZVFHMIN64-NEXT: mul t1, t1, t3
+; ZVFHMIN64-NEXT: add t1, sp, t1
+; ZVFHMIN64-NEXT: addi t1, t1, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 186(sp)
; ZVFHMIN64-NEXT: lh a0, 754(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa2, t0
; ZVFHMIN64-NEXT: lh a1, 498(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t1
-; ZVFHMIN64-NEXT: sb a0, 185(sp)
-; ZVFHMIN64-NEXT: lh a0, 752(sp)
-; ZVFHMIN64-NEXT: lh a1, 496(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT: sb a0, 184(sp)
-; ZVFHMIN64-NEXT: lh a0, 750(sp)
-; ZVFHMIN64-NEXT: lh a1, 494(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s6
+; ZVFHMIN64-NEXT: csrr t0, vlenb
+; ZVFHMIN64-NEXT: li t1, 19
+; ZVFHMIN64-NEXT: mul t0, t0, t1
+; ZVFHMIN64-NEXT: add t0, sp, t0
+; ZVFHMIN64-NEXT: addi t0, t0, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s s3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT: csrr a0, vlenb
+; ZVFHMIN64-NEXT: li t0, 21
+; ZVFHMIN64-NEXT: mul a0, a0, t0
+; ZVFHMIN64-NEXT: add a0, sp, a0
+; ZVFHMIN64-NEXT: addi a0, a0, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s a0, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa1, fa0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a2
+; ZVFHMIN64-NEXT: sb a1, 185(sp)
+; ZVFHMIN64-NEXT: lh a1, 752(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a3
+; ZVFHMIN64-NEXT: lh a2, 496(sp)
+; ZVFHMIN64-NEXT: feq.h t0, fa5, fa1
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: feq.h t1, fa4, fa0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: sb a1, 184(sp)
+; ZVFHMIN64-NEXT: lh a1, 750(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: lh a2, 494(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa3, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa2, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT: sb a0, 183(sp)
-; ZVFHMIN64-NEXT: lh a0, 748(sp)
-; ZVFHMIN64-NEXT: lh a1, 492(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s4
-; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT: sb a0, 182(sp)
-; ZVFHMIN64-NEXT: lh a0, 746(sp)
-; ZVFHMIN64-NEXT: lh a1, 490(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s5
-; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a2, 183(sp)
+; ZVFHMIN64-NEXT: lh a2, 748(sp)
+; ZVFHMIN64-NEXT: lh a4, 492(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT: sb a0, 181(sp)
-; ZVFHMIN64-NEXT: lh a0, 744(sp)
-; ZVFHMIN64-NEXT: lh a1, 488(sp)
+; ZVFHMIN64-NEXT: sb a2, 182(sp)
+; ZVFHMIN64-NEXT: lh a2, 746(sp)
+; ZVFHMIN64-NEXT: lh a4, 490(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, s3
; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT: addi a1, sp, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s a1, v8
+; ZVFHMIN64-NEXT: sb a2, 181(sp)
+; ZVFHMIN64-NEXT: lh a2, 744(sp)
+; ZVFHMIN64-NEXT: lh a4, 488(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: ld a4, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: vmv.x.s a5, v0
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT: vmv.x.s a5, v8
-; ZVFHMIN64-NEXT: sb a0, 180(sp)
-; ZVFHMIN64-NEXT: lh a0, 742(sp)
-; ZVFHMIN64-NEXT: lh a7, 486(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 179(sp)
-; ZVFHMIN64-NEXT: lh a0, 740(sp)
-; ZVFHMIN64-NEXT: lh a7, 484(sp)
-; ZVFHMIN64-NEXT: sb a2, 140(sp)
-; ZVFHMIN64-NEXT: sb t1, 141(sp)
-; ZVFHMIN64-NEXT: sb t3, 142(sp)
-; ZVFHMIN64-NEXT: sb t4, 143(sp)
-; ZVFHMIN64-NEXT: sb a1, 136(sp)
-; ZVFHMIN64-NEXT: sb a6, 137(sp)
-; ZVFHMIN64-NEXT: sb a4, 138(sp)
-; ZVFHMIN64-NEXT: sb a3, 139(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: vmv.x.s a4, v8
+; ZVFHMIN64-NEXT: sb a2, 180(sp)
+; ZVFHMIN64-NEXT: lh a2, 742(sp)
+; ZVFHMIN64-NEXT: lh t2, 486(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a2, 179(sp)
+; ZVFHMIN64-NEXT: lh a2, 740(sp)
+; ZVFHMIN64-NEXT: lh t2, 484(sp)
+; ZVFHMIN64-NEXT: sb a1, 140(sp)
+; ZVFHMIN64-NEXT: sb a3, 141(sp)
+; ZVFHMIN64-NEXT: sb t1, 142(sp)
+; ZVFHMIN64-NEXT: sb t0, 143(sp)
+; ZVFHMIN64-NEXT: sb a5, 136(sp)
+; ZVFHMIN64-NEXT: sb a0, 137(sp)
+; ZVFHMIN64-NEXT: sb a6, 138(sp)
+; ZVFHMIN64-NEXT: sb a7, 139(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 178(sp)
; ZVFHMIN64-NEXT: lh a0, 638(sp)
; ZVFHMIN64-NEXT: lh a1, 382(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14
-; ZVFHMIN64-NEXT: vmv.x.s t3, v8
+; ZVFHMIN64-NEXT: vmv.x.s t2, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -3220,7 +3221,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 636(sp)
; ZVFHMIN64-NEXT: lh a1, 380(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13
-; ZVFHMIN64-NEXT: vmv.x.s t2, v8
+; ZVFHMIN64-NEXT: vmv.x.s t1, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -3228,7 +3229,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 634(sp)
; ZVFHMIN64-NEXT: lh a1, 378(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12
-; ZVFHMIN64-NEXT: vmv.x.s t1, v8
+; ZVFHMIN64-NEXT: vmv.x.s t0, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -3236,7 +3237,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 632(sp)
; ZVFHMIN64-NEXT: lh a1, 376(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11
-; ZVFHMIN64-NEXT: vmv.x.s t0, v8
+; ZVFHMIN64-NEXT: vmv.x.s a7, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -3244,7 +3245,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 630(sp)
; ZVFHMIN64-NEXT: lh a1, 374(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10
-; ZVFHMIN64-NEXT: vmv.x.s a7, v8
+; ZVFHMIN64-NEXT: vmv.x.s a6, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -3252,102 +3253,101 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 628(sp)
; ZVFHMIN64-NEXT: lh a1, 372(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9
-; ZVFHMIN64-NEXT: vmv.x.s a6, v8
+; ZVFHMIN64-NEXT: vmv.x.s a5, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: ld a1, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
; ZVFHMIN64-NEXT: sb a0, 250(sp)
; ZVFHMIN64-NEXT: lh a0, 626(sp)
; ZVFHMIN64-NEXT: lh a1, 370(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 249(sp)
-; ZVFHMIN64-NEXT: lh a0, 624(sp)
-; ZVFHMIN64-NEXT: lh a1, 368(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 248(sp)
-; ZVFHMIN64-NEXT: lh a0, 622(sp)
-; ZVFHMIN64-NEXT: lh a1, 366(sp)
+; ZVFHMIN64-NEXT: sb a0, 249(sp)
+; ZVFHMIN64-NEXT: lh a1, 624(sp)
+; ZVFHMIN64-NEXT: lh a3, 368(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: ld a1, 88(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 247(sp)
-; ZVFHMIN64-NEXT: lh a0, 620(sp)
-; ZVFHMIN64-NEXT: lh a1, 364(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: ld a3, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: sb a1, 248(sp)
+; ZVFHMIN64-NEXT: lh a1, 622(sp)
+; ZVFHMIN64-NEXT: lh a3, 366(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 246(sp)
-; ZVFHMIN64-NEXT: lh a0, 618(sp)
-; ZVFHMIN64-NEXT: lh a1, 362(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: ld a3, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: sb a1, 247(sp)
+; ZVFHMIN64-NEXT: lh a1, 620(sp)
+; ZVFHMIN64-NEXT: lh a3, 364(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, t0
; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT: sb a0, 245(sp)
-; ZVFHMIN64-NEXT: lh a0, 616(sp)
-; ZVFHMIN64-NEXT: lh a1, 360(sp)
+; ZVFHMIN64-NEXT: sb a1, 246(sp)
+; ZVFHMIN64-NEXT: lh a1, 618(sp)
+; ZVFHMIN64-NEXT: lh a3, 362(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT: sb a0, 244(sp)
-; ZVFHMIN64-NEXT: lh a0, 614(sp)
-; ZVFHMIN64-NEXT: lh a1, 358(sp)
+; ZVFHMIN64-NEXT: sb a1, 245(sp)
+; ZVFHMIN64-NEXT: lh a1, 616(sp)
+; ZVFHMIN64-NEXT: lh a3, 360(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, a6
; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT: sb a1, 244(sp)
+; ZVFHMIN64-NEXT: lh a1, 614(sp)
+; ZVFHMIN64-NEXT: lh a3, 358(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8
-; ZVFHMIN64-NEXT: vmv.x.s a1, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: sb a0, 243(sp)
-; ZVFHMIN64-NEXT: lh a0, 612(sp)
-; ZVFHMIN64-NEXT: lh a1, 356(sp)
-; ZVFHMIN64-NEXT: sb a5, 204(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: sb a1, 243(sp)
+; ZVFHMIN64-NEXT: lh a1, 612(sp)
+; ZVFHMIN64-NEXT: lh a3, 356(sp)
+; ZVFHMIN64-NEXT: sb t0, 204(sp)
; ZVFHMIN64-NEXT: sb a4, 205(sp)
-; ZVFHMIN64-NEXT: sb a2, 206(sp)
-; ZVFHMIN64-NEXT: sb a3, 207(sp)
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: sb a2, 200(sp)
-; ZVFHMIN64-NEXT: sb a6, 201(sp)
-; ZVFHMIN64-NEXT: sb a7, 202(sp)
-; ZVFHMIN64-NEXT: sb t0, 203(sp)
-; ZVFHMIN64-NEXT: li a2, 128
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 242(sp)
-; ZVFHMIN64-NEXT: addi a0, sp, 128
-; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN64-NEXT: vle8.v v8, (a0)
+; ZVFHMIN64-NEXT: sb a0, 206(sp)
+; ZVFHMIN64-NEXT: sb a2, 207(sp)
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 200(sp)
+; ZVFHMIN64-NEXT: sb a5, 201(sp)
+; ZVFHMIN64-NEXT: sb a6, 202(sp)
+; ZVFHMIN64-NEXT: sb a7, 203(sp)
+; ZVFHMIN64-NEXT: li a0, 128
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 242(sp)
+; ZVFHMIN64-NEXT: addi a1, sp, 128
+; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN64-NEXT: vle8.v v8, (a1)
; ZVFHMIN64-NEXT: vand.vi v8, v8, 1
; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN64-NEXT: addi sp, s0, -896
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index e70dcd16d02cd2..dd2a8240ee2533 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -507,37 +507,28 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
; RV32-LABEL: match_nxv16i8_v32i8:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: .cfi_offset s1, -12
-; RV32-NEXT: .cfi_offset s2, -16
-; RV32-NEXT: .cfi_offset s3, -20
-; RV32-NEXT: .cfi_offset s4, -24
-; RV32-NEXT: .cfi_offset s5, -28
-; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: .cfi_offset s7, -36
-; RV32-NEXT: .cfi_offset s8, -40
-; RV32-NEXT: .cfi_offset s9, -44
-; RV32-NEXT: .cfi_offset s10, -48
-; RV32-NEXT: .cfi_offset s11, -52
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
+; RV32-NEXT: .cfi_offset s2, -12
+; RV32-NEXT: .cfi_offset s3, -16
+; RV32-NEXT: .cfi_offset s4, -20
+; RV32-NEXT: .cfi_offset s5, -24
+; RV32-NEXT: .cfi_offset s6, -28
+; RV32-NEXT: .cfi_offset s7, -32
+; RV32-NEXT: .cfi_offset s8, -36
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: vslidedown.vi v12, v10, 1
; RV32-NEXT: vslidedown.vi v13, v10, 2
; RV32-NEXT: vslidedown.vi v14, v10, 3
@@ -593,95 +584,89 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: vmv.x.s s5, v15
; RV32-NEXT: vmv.x.s s6, v16
; RV32-NEXT: vmv.x.s s7, v17
-; RV32-NEXT: vmv.x.s s8, v18
-; RV32-NEXT: vmv.x.s s9, v19
-; RV32-NEXT: vmv.x.s s10, v20
-; RV32-NEXT: vmv.x.s s11, v21
-; RV32-NEXT: vmv.x.s ra, v22
-; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; RV32-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: vsetvli s8, zero, e8, m2, ta, ma
; RV32-NEXT: vmseq.vx v12, v8, a0
-; RV32-NEXT: vmv.x.s a0, v23
+; RV32-NEXT: vmv.x.s a0, v18
; RV32-NEXT: vmseq.vx v13, v8, s2
-; RV32-NEXT: vmv.x.s s2, v11
-; RV32-NEXT: vmseq.vx v11, v8, s3
-; RV32-NEXT: vmv.x.s s3, v24
-; RV32-NEXT: vmseq.vx v14, v8, s4
-; RV32-NEXT: vmv.x.s s4, v10
-; RV32-NEXT: vmseq.vx v10, v8, s5
-; RV32-NEXT: vmor.mm v12, v12, v13
-; RV32-NEXT: vmseq.vx v13, v8, s6
-; RV32-NEXT: vmor.mm v11, v12, v11
-; RV32-NEXT: vmseq.vx v12, v8, s7
-; RV32-NEXT: vmor.mm v11, v11, v14
-; RV32-NEXT: vmseq.vx v14, v8, s8
-; RV32-NEXT: vmor.mm v10, v11, v10
-; RV32-NEXT: vmseq.vx v11, v8, s9
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, s10
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s11
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, ra
-; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmv.x.s s2, v19
+; RV32-NEXT: vmseq.vx v14, v8, s3
+; RV32-NEXT: vmv.x.s s3, v20
+; RV32-NEXT: vmseq.vx v15, v8, s4
+; RV32-NEXT: vmv.x.s s4, v21
+; RV32-NEXT: vmseq.vx v16, v8, s5
+; RV32-NEXT: vmv.x.s s5, v22
+; RV32-NEXT: vmseq.vx v17, v8, s6
+; RV32-NEXT: vmv.x.s s6, v23
+; RV32-NEXT: vmseq.vx v18, v8, s7
+; RV32-NEXT: vmv.x.s s7, v11
; RV32-NEXT: vmseq.vx v11, v8, a0
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, s2
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s3
+; RV32-NEXT: vmv.x.s a0, v24
+; RV32-NEXT: vmseq.vx v19, v8, s2
+; RV32-NEXT: vmv.x.s s2, v10
+; RV32-NEXT: vmor.mm v10, v12, v13
; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, s4
+; RV32-NEXT: vmor.mm v10, v10, v15
+; RV32-NEXT: vmor.mm v10, v10, v16
+; RV32-NEXT: vmor.mm v10, v10, v17
+; RV32-NEXT: vmseq.vx v12, v8, s3
+; RV32-NEXT: vmor.mm v10, v10, v18
+; RV32-NEXT: vmseq.vx v13, v8, s4
; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, a1
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, a2
+; RV32-NEXT: vmseq.vx v11, v8, s5
+; RV32-NEXT: vmor.mm v10, v10, v19
+; RV32-NEXT: vmseq.vx v14, v8, s6
; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, a3
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, a4
-; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, a5
+; RV32-NEXT: vmseq.vx v12, v8, s7
; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, a6
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, a7
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, t0
+; RV32-NEXT: vmseq.vx v13, v8, a0
; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, t1
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, t2
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, t3
+; RV32-NEXT: vmseq.vx v11, v8, s2
; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, t4
-; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, t5
+; RV32-NEXT: vmseq.vx v14, v8, a1
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, a2
; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, t6
+; RV32-NEXT: vmseq.vx v13, v8, a3
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vx v11, v8, a4
+; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v14, v8, a5
; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s0
+; RV32-NEXT: vmseq.vx v12, v8, a6
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, a7
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vx v11, v8, t0
; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v14, v8, t1
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, t2
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, t3
; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vx v11, v8, t4
+; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v14, v8, t5
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, t6
; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, s0
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmor.mm v10, v10, v14
; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmor.mm v10, v10, v13
; RV32-NEXT: vmseq.vx v11, v8, s1
; RV32-NEXT: vmor.mm v8, v10, v11
; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
@@ -691,46 +676,34 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: .cfi_restore s6
; RV32-NEXT: .cfi_restore s7
; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: .cfi_restore s10
-; RV32-NEXT: .cfi_restore s11
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: match_nxv16i8_v32i8:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -112
-; RV64-NEXT: .cfi_def_cfa_offset 112
-; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
-; RV64-NEXT: .cfi_offset s2, -32
-; RV64-NEXT: .cfi_offset s3, -40
-; RV64-NEXT: .cfi_offset s4, -48
-; RV64-NEXT: .cfi_offset s5, -56
-; RV64-NEXT: .cfi_offset s6, -64
-; RV64-NEXT: .cfi_offset s7, -72
-; RV64-NEXT: .cfi_offset s8, -80
-; RV64-NEXT: .cfi_offset s9, -88
-; RV64-NEXT: .cfi_offset s10, -96
-; RV64-NEXT: .cfi_offset s11, -104
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: .cfi_def_cfa_offset 80
+; RV64-NEXT: sd s0, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s7, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s8, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: .cfi_offset s1, -16
+; RV64-NEXT: .cfi_offset s2, -24
+; RV64-NEXT: .cfi_offset s3, -32
+; RV64-NEXT: .cfi_offset s4, -40
+; RV64-NEXT: .cfi_offset s5, -48
+; RV64-NEXT: .cfi_offset s6, -56
+; RV64-NEXT: .cfi_offset s7, -64
+; RV64-NEXT: .cfi_offset s8, -72
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: sd a0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: vslidedown.vi v12, v10, 1
; RV64-NEXT: vslidedown.vi v13, v10, 2
; RV64-NEXT: vslidedown.vi v14, v10, 3
@@ -786,95 +759,89 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: vmv.x.s s5, v15
; RV64-NEXT: vmv.x.s s6, v16
; RV64-NEXT: vmv.x.s s7, v17
-; RV64-NEXT: vmv.x.s s8, v18
-; RV64-NEXT: vmv.x.s s9, v19
-; RV64-NEXT: vmv.x.s s10, v20
-; RV64-NEXT: vmv.x.s s11, v21
-; RV64-NEXT: vmv.x.s ra, v22
-; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; RV64-NEXT: ld a0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT: vsetvli s8, zero, e8, m2, ta, ma
; RV64-NEXT: vmseq.vx v12, v8, a0
-; RV64-NEXT: vmv.x.s a0, v23
+; RV64-NEXT: vmv.x.s a0, v18
; RV64-NEXT: vmseq.vx v13, v8, s2
-; RV64-NEXT: vmv.x.s s2, v11
-; RV64-NEXT: vmseq.vx v11, v8, s3
-; RV64-NEXT: vmv.x.s s3, v24
-; RV64-NEXT: vmseq.vx v14, v8, s4
-; RV64-NEXT: vmv.x.s s4, v10
-; RV64-NEXT: vmseq.vx v10, v8, s5
-; RV64-NEXT: vmor.mm v12, v12, v13
-; RV64-NEXT: vmseq.vx v13, v8, s6
-; RV64-NEXT: vmor.mm v11, v12, v11
-; RV64-NEXT: vmseq.vx v12, v8, s7
-; RV64-NEXT: vmor.mm v11, v11, v14
-; RV64-NEXT: vmseq.vx v14, v8, s8
-; RV64-NEXT: vmor.mm v10, v11, v10
-; RV64-NEXT: vmseq.vx v11, v8, s9
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, s10
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s11
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, ra
-; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmv.x.s s2, v19
+; RV64-NEXT: vmseq.vx v14, v8, s3
+; RV64-NEXT: vmv.x.s s3, v20
+; RV64-NEXT: vmseq.vx v15, v8, s4
+; RV64-NEXT: vmv.x.s s4, v21
+; RV64-NEXT: vmseq.vx v16, v8, s5
+; RV64-NEXT: vmv.x.s s5, v22
+; RV64-NEXT: vmseq.vx v17, v8, s6
+; RV64-NEXT: vmv.x.s s6, v23
+; RV64-NEXT: vmseq.vx v18, v8, s7
+; RV64-NEXT: vmv.x.s s7, v11
; RV64-NEXT: vmseq.vx v11, v8, a0
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, s2
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s3
+; RV64-NEXT: vmv.x.s a0, v24
+; RV64-NEXT: vmseq.vx v19, v8, s2
+; RV64-NEXT: vmv.x.s s2, v10
+; RV64-NEXT: vmor.mm v10, v12, v13
; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, s4
+; RV64-NEXT: vmor.mm v10, v10, v15
+; RV64-NEXT: vmor.mm v10, v10, v16
+; RV64-NEXT: vmor.mm v10, v10, v17
+; RV64-NEXT: vmseq.vx v12, v8, s3
+; RV64-NEXT: vmor.mm v10, v10, v18
+; RV64-NEXT: vmseq.vx v13, v8, s4
; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, a1
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, a2
+; RV64-NEXT: vmseq.vx v11, v8, s5
+; RV64-NEXT: vmor.mm v10, v10, v19
+; RV64-NEXT: vmseq.vx v14, v8, s6
; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, a3
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, a4
-; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, a5
+; RV64-NEXT: vmseq.vx v12, v8, s7
; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, a6
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, a7
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, t0
+; RV64-NEXT: vmseq.vx v13, v8, a0
; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, t1
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, t2
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, t3
+; RV64-NEXT: vmseq.vx v11, v8, s2
; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, t4
-; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, t5
+; RV64-NEXT: vmseq.vx v14, v8, a1
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, a2
; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, t6
+; RV64-NEXT: vmseq.vx v13, v8, a3
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vx v11, v8, a4
+; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v14, v8, a5
; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s0
+; RV64-NEXT: vmseq.vx v12, v8, a6
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, a7
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vx v11, v8, t0
; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v14, v8, t1
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, t2
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, t3
; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vx v11, v8, t4
+; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v14, v8, t5
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, t6
; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, s0
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmor.mm v10, v10, v14
; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmor.mm v10, v10, v13
; RV64-NEXT: vmseq.vx v11, v8, s1
; RV64-NEXT: vmor.mm v8, v10, v11
; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: ld s0, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s7, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s8, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: .cfi_restore s2
@@ -884,10 +851,7 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: .cfi_restore s6
; RV64-NEXT: .cfi_restore s7
; RV64-NEXT: .cfi_restore s8
-; RV64-NEXT: .cfi_restore s9
-; RV64-NEXT: .cfi_restore s10
-; RV64-NEXT: .cfi_restore s11
-; RV64-NEXT: addi sp, sp, 112
+; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
@@ -897,34 +861,24 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
; RV32-LABEL: match_v16i8_v32i8:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: .cfi_offset s1, -12
-; RV32-NEXT: .cfi_offset s2, -16
-; RV32-NEXT: .cfi_offset s3, -20
-; RV32-NEXT: .cfi_offset s4, -24
-; RV32-NEXT: .cfi_offset s5, -28
-; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: .cfi_offset s7, -36
-; RV32-NEXT: .cfi_offset s8, -40
-; RV32-NEXT: .cfi_offset s9, -44
-; RV32-NEXT: .cfi_offset s10, -48
-; RV32-NEXT: .cfi_offset s11, -52
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
+; RV32-NEXT: .cfi_offset s2, -12
+; RV32-NEXT: .cfi_offset s3, -16
+; RV32-NEXT: .cfi_offset s4, -20
+; RV32-NEXT: .cfi_offset s5, -24
+; RV32-NEXT: .cfi_offset s6, -28
+; RV32-NEXT: .cfi_offset s7, -32
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v10
; RV32-NEXT: vslidedown.vi v9, v10, 1
@@ -982,93 +936,87 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: vmv.x.s s5, v14
; RV32-NEXT: vmv.x.s s6, v15
; RV32-NEXT: vmv.x.s s7, v16
-; RV32-NEXT: vmv.x.s s8, v17
-; RV32-NEXT: vmv.x.s s9, v18
-; RV32-NEXT: vmv.x.s s10, v19
-; RV32-NEXT: vmv.x.s s11, v20
-; RV32-NEXT: vmv.x.s ra, v21
; RV32-NEXT: vmseq.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a0, v22
+; RV32-NEXT: vmv.x.s a0, v17
; RV32-NEXT: vmseq.vx v12, v8, s2
-; RV32-NEXT: vmv.x.s s2, v11
-; RV32-NEXT: vmseq.vx v11, v8, s3
-; RV32-NEXT: vmv.x.s s3, v23
-; RV32-NEXT: vmseq.vx v13, v8, s4
-; RV32-NEXT: vmv.x.s s4, v10
-; RV32-NEXT: vmseq.vx v10, v8, s5
+; RV32-NEXT: vmv.x.s s2, v18
+; RV32-NEXT: vmseq.vx v13, v8, s3
+; RV32-NEXT: vmv.x.s s3, v19
+; RV32-NEXT: vmseq.vx v14, v8, s4
+; RV32-NEXT: vmv.x.s s4, v20
+; RV32-NEXT: vmseq.vx v15, v8, s5
+; RV32-NEXT: vmv.x.s s5, v21
+; RV32-NEXT: vmseq.vx v16, v8, s6
+; RV32-NEXT: vmv.x.s s6, v22
+; RV32-NEXT: vmseq.vx v17, v8, s7
+; RV32-NEXT: vmv.x.s s7, v11
+; RV32-NEXT: vmseq.vx v11, v8, a0
+; RV32-NEXT: vmv.x.s a0, v23
+; RV32-NEXT: vmseq.vx v18, v8, s2
+; RV32-NEXT: vmv.x.s s2, v10
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, s6
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s7
; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, s8
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, s9
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, s10
+; RV32-NEXT: vmor.mm v9, v9, v14
+; RV32-NEXT: vmor.mm v9, v9, v15
+; RV32-NEXT: vmor.mm v9, v9, v16
+; RV32-NEXT: vmseq.vx v10, v8, s3
+; RV32-NEXT: vmor.mm v9, v9, v17
+; RV32-NEXT: vmseq.vx v12, v8, s4
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s11
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, ra
+; RV32-NEXT: vmseq.vx v11, v8, s5
+; RV32-NEXT: vmor.mm v9, v9, v18
+; RV32-NEXT: vmseq.vx v13, v8, s6
; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, a0
+; RV32-NEXT: vmseq.vx v10, v8, s7
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, s2
+; RV32-NEXT: vmseq.vx v12, v8, a0
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s3
+; RV32-NEXT: vmseq.vx v11, v8, s2
; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, s4
+; RV32-NEXT: vmseq.vx v13, v8, a1
; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, a1
+; RV32-NEXT: vmseq.vx v10, v8, a2
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, a2
+; RV32-NEXT: vmseq.vx v12, v8, a3
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, a3
+; RV32-NEXT: vmseq.vx v11, v8, a4
; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, a4
+; RV32-NEXT: vmseq.vx v13, v8, a5
; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, a5
+; RV32-NEXT: vmseq.vx v10, v8, a6
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, a6
+; RV32-NEXT: vmseq.vx v12, v8, a7
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, a7
+; RV32-NEXT: vmseq.vx v11, v8, t0
; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, t0
+; RV32-NEXT: vmseq.vx v13, v8, t1
; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, t1
+; RV32-NEXT: vmseq.vx v10, v8, t2
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, t2
+; RV32-NEXT: vmseq.vx v12, v8, t3
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, t3
+; RV32-NEXT: vmseq.vx v11, v8, t4
; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, t4
+; RV32-NEXT: vmseq.vx v13, v8, t5
; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, t5
+; RV32-NEXT: vmseq.vx v10, v8, t6
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, t6
+; RV32-NEXT: vmseq.vx v12, v8, s0
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s0
; RV32-NEXT: vmor.mm v9, v9, v13
; RV32-NEXT: vmor.mm v9, v9, v10
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmor.mm v9, v9, v11
; RV32-NEXT: vmseq.vx v8, v8, s1
; RV32-NEXT: vmor.mm v8, v9, v8
; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
@@ -1077,44 +1025,30 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: .cfi_restore s5
; RV32-NEXT: .cfi_restore s6
; RV32-NEXT: .cfi_restore s7
-; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: .cfi_restore s10
-; RV32-NEXT: .cfi_restore s11
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: match_v16i8_v32i8:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -112
-; RV64-NEXT: .cfi_def_cfa_offset 112
-; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
-; RV64-NEXT: .cfi_offset s2, -32
-; RV64-NEXT: .cfi_offset s3, -40
-; RV64-NEXT: .cfi_offset s4, -48
-; RV64-NEXT: .cfi_offset s5, -56
-; RV64-NEXT: .cfi_offset s6, -64
-; RV64-NEXT: .cfi_offset s7, -72
-; RV64-NEXT: .cfi_offset s8, -80
-; RV64-NEXT: .cfi_offset s9, -88
-; RV64-NEXT: .cfi_offset s10, -96
-; RV64-NEXT: .cfi_offset s11, -104
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: .cfi_def_cfa_offset 64
+; RV64-NEXT: sd s0, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s7, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: .cfi_offset s1, -16
+; RV64-NEXT: .cfi_offset s2, -24
+; RV64-NEXT: .cfi_offset s3, -32
+; RV64-NEXT: .cfi_offset s4, -40
+; RV64-NEXT: .cfi_offset s5, -48
+; RV64-NEXT: .cfi_offset s6, -56
+; RV64-NEXT: .cfi_offset s7, -64
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v10
; RV64-NEXT: vslidedown.vi v9, v10, 1
@@ -1172,93 +1106,87 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: vmv.x.s s5, v14
; RV64-NEXT: vmv.x.s s6, v15
; RV64-NEXT: vmv.x.s s7, v16
-; RV64-NEXT: vmv.x.s s8, v17
-; RV64-NEXT: vmv.x.s s9, v18
-; RV64-NEXT: vmv.x.s s10, v19
-; RV64-NEXT: vmv.x.s s11, v20
-; RV64-NEXT: vmv.x.s ra, v21
; RV64-NEXT: vmseq.vx v9, v8, a0
-; RV64-NEXT: vmv.x.s a0, v22
+; RV64-NEXT: vmv.x.s a0, v17
; RV64-NEXT: vmseq.vx v12, v8, s2
-; RV64-NEXT: vmv.x.s s2, v11
-; RV64-NEXT: vmseq.vx v11, v8, s3
-; RV64-NEXT: vmv.x.s s3, v23
-; RV64-NEXT: vmseq.vx v13, v8, s4
-; RV64-NEXT: vmv.x.s s4, v10
-; RV64-NEXT: vmseq.vx v10, v8, s5
+; RV64-NEXT: vmv.x.s s2, v18
+; RV64-NEXT: vmseq.vx v13, v8, s3
+; RV64-NEXT: vmv.x.s s3, v19
+; RV64-NEXT: vmseq.vx v14, v8, s4
+; RV64-NEXT: vmv.x.s s4, v20
+; RV64-NEXT: vmseq.vx v15, v8, s5
+; RV64-NEXT: vmv.x.s s5, v21
+; RV64-NEXT: vmseq.vx v16, v8, s6
+; RV64-NEXT: vmv.x.s s6, v22
+; RV64-NEXT: vmseq.vx v17, v8, s7
+; RV64-NEXT: vmv.x.s s7, v11
+; RV64-NEXT: vmseq.vx v11, v8, a0
+; RV64-NEXT: vmv.x.s a0, v23
+; RV64-NEXT: vmseq.vx v18, v8, s2
+; RV64-NEXT: vmv.x.s s2, v10
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, s6
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s7
; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, s8
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, s9
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, s10
+; RV64-NEXT: vmor.mm v9, v9, v14
+; RV64-NEXT: vmor.mm v9, v9, v15
+; RV64-NEXT: vmor.mm v9, v9, v16
+; RV64-NEXT: vmseq.vx v10, v8, s3
+; RV64-NEXT: vmor.mm v9, v9, v17
+; RV64-NEXT: vmseq.vx v12, v8, s4
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s11
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, ra
+; RV64-NEXT: vmseq.vx v11, v8, s5
+; RV64-NEXT: vmor.mm v9, v9, v18
+; RV64-NEXT: vmseq.vx v13, v8, s6
; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, a0
+; RV64-NEXT: vmseq.vx v10, v8, s7
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, s2
+; RV64-NEXT: vmseq.vx v12, v8, a0
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s3
+; RV64-NEXT: vmseq.vx v11, v8, s2
; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, s4
+; RV64-NEXT: vmseq.vx v13, v8, a1
; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, a1
+; RV64-NEXT: vmseq.vx v10, v8, a2
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, a2
+; RV64-NEXT: vmseq.vx v12, v8, a3
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, a3
+; RV64-NEXT: vmseq.vx v11, v8, a4
; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, a4
+; RV64-NEXT: vmseq.vx v13, v8, a5
; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, a5
+; RV64-NEXT: vmseq.vx v10, v8, a6
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, a6
+; RV64-NEXT: vmseq.vx v12, v8, a7
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, a7
+; RV64-NEXT: vmseq.vx v11, v8, t0
; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, t0
+; RV64-NEXT: vmseq.vx v13, v8, t1
; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, t1
+; RV64-NEXT: vmseq.vx v10, v8, t2
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, t2
+; RV64-NEXT: vmseq.vx v12, v8, t3
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, t3
+; RV64-NEXT: vmseq.vx v11, v8, t4
; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, t4
+; RV64-NEXT: vmseq.vx v13, v8, t5
; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, t5
+; RV64-NEXT: vmseq.vx v10, v8, t6
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, t6
+; RV64-NEXT: vmseq.vx v12, v8, s0
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s0
; RV64-NEXT: vmor.mm v9, v9, v13
; RV64-NEXT: vmor.mm v9, v9, v10
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmor.mm v9, v9, v11
; RV64-NEXT: vmseq.vx v8, v8, s1
; RV64-NEXT: vmor.mm v8, v9, v8
; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: ld s0, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s7, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: .cfi_restore s2
@@ -1267,11 +1195,7 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: .cfi_restore s5
; RV64-NEXT: .cfi_restore s6
; RV64-NEXT: .cfi_restore s7
-; RV64-NEXT: .cfi_restore s8
-; RV64-NEXT: .cfi_restore s9
-; RV64-NEXT: .cfi_restore s10
-; RV64-NEXT: .cfi_restore s11
-; RV64-NEXT: addi sp, sp, 112
+; RV64-NEXT: addi sp, sp, 64
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index c35f05be304cce..ec2448cb3965f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -489,8 +489,9 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64-NEXT: j .LBB0_11
; RV64-NEXT: .LBB0_8: # %vector.ph
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
-; RV64-NEXT: slli t6, t0, 28
-; RV64-NEXT: sub t6, t6, t1
+; RV64-NEXT: slli t6, t0, 1
+; RV64-NEXT: slli s0, t0, 28
+; RV64-NEXT: sub t6, s0, t6
; RV64-NEXT: and t6, t6, a6
; RV64-NEXT: csrwi vxrm, 0
; RV64-NEXT: mv s0, a2
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 437b7e557718cc..22e6f23d4d6e6a 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2203,139 +2203,136 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t3, 5(a0)
-; RV32I-NEXT: lbu t4, 6(a0)
-; RV32I-NEXT: lbu s0, 7(a0)
-; RV32I-NEXT: lbu t2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s6, 10(a0)
-; RV32I-NEXT: lbu s8, 11(a0)
-; RV32I-NEXT: lbu s9, 12(a0)
-; RV32I-NEXT: lbu s10, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s7, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu ra, 18(a0)
-; RV32I-NEXT: lbu a3, 19(a0)
-; RV32I-NEXT: lbu t5, 20(a0)
-; RV32I-NEXT: lbu t6, 21(a0)
-; RV32I-NEXT: lbu a7, 22(a0)
-; RV32I-NEXT: lbu t0, 23(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t3, t3, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t3, t1
-; RV32I-NEXT: or a6, s0, t4
-; RV32I-NEXT: lbu t1, 24(a0)
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
; RV32I-NEXT: lbu s0, 25(a0)
; RV32I-NEXT: lbu s1, 26(a0)
; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s5, s5, 8
; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: or t2, s3, t2
-; RV32I-NEXT: or t3, s8, s6
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s7, s4
-; RV32I-NEXT: or s4, s11, s5
-; RV32I-NEXT: or s5, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s10, 2(a1)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t5, t6, t5
-; RV32I-NEXT: addi t6, sp, 8
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s0, t1
-; RV32I-NEXT: or t1, s2, s1
-; RV32I-NEXT: or s0, s6, s3
-; RV32I-NEXT: or s1, s9, s8
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s10
-; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s2
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or a0, a0, t4
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or a7, a7, t5
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: sw t2, 24(sp)
-; RV32I-NEXT: sw a7, 28(sp)
-; RV32I-NEXT: sw t0, 32(sp)
-; RV32I-NEXT: sw s0, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or a0, a0, t5
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s5, s3
+; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: sw a7, 16(sp)
; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: slli t1, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: add a1, t6, a1
+; RV32I-NEXT: add a1, s4, a1
; RV32I-NEXT: andi a0, t1, 24
-; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: xori a7, a0, 31
; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a6, 12(a1)
-; RV32I-NEXT: lw a7, 16(a1)
+; RV32I-NEXT: lw t0, 16(a1)
; RV32I-NEXT: lw t2, 20(a1)
; RV32I-NEXT: lw t3, 24(a1)
; RV32I-NEXT: lw t4, 28(a1)
@@ -2344,33 +2341,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a1, a3, t1
; RV32I-NEXT: slli t6, a4, 1
; RV32I-NEXT: srl a3, a6, t1
-; RV32I-NEXT: slli s0, a7, 1
+; RV32I-NEXT: slli s0, t0, 1
; RV32I-NEXT: srl a4, a5, t1
; RV32I-NEXT: slli s1, a6, 1
; RV32I-NEXT: srl a5, t2, t1
; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: srl a6, a7, t1
+; RV32I-NEXT: srl a6, t0, t1
; RV32I-NEXT: slli t2, t2, 1
-; RV32I-NEXT: srl a7, t3, t1
+; RV32I-NEXT: srl t0, t3, t1
; RV32I-NEXT: slli t3, t4, 1
; RV32I-NEXT: srl t1, t4, t1
-; RV32I-NEXT: sll t4, t5, t0
-; RV32I-NEXT: sll t5, t6, t0
-; RV32I-NEXT: sll t6, s0, t0
-; RV32I-NEXT: sll s0, s1, t0
-; RV32I-NEXT: sll s1, s2, t0
-; RV32I-NEXT: sll t2, t2, t0
-; RV32I-NEXT: sll t3, t3, t0
+; RV32I-NEXT: sll t4, t5, a7
+; RV32I-NEXT: sll t5, t6, a7
+; RV32I-NEXT: sll t6, s0, a7
+; RV32I-NEXT: sll s0, s1, a7
+; RV32I-NEXT: sll s1, s2, a7
+; RV32I-NEXT: sll t2, t2, a7
+; RV32I-NEXT: sll t3, t3, a7
; RV32I-NEXT: srli s2, t1, 24
; RV32I-NEXT: srli s3, t1, 16
; RV32I-NEXT: srli s4, t1, 8
-; RV32I-NEXT: or t0, a0, t4
+; RV32I-NEXT: or a7, a0, t4
; RV32I-NEXT: or t4, a1, t5
; RV32I-NEXT: or t5, a3, t6
; RV32I-NEXT: or s0, a4, s0
; RV32I-NEXT: or s1, a5, s1
; RV32I-NEXT: or t2, a6, t2
-; RV32I-NEXT: or t3, a7, t3
+; RV32I-NEXT: or t3, t0, t3
; RV32I-NEXT: sb t1, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
; RV32I-NEXT: sb s3, 30(a2)
@@ -2387,23 +2384,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s6, s0, 24
; RV32I-NEXT: srli s7, s0, 16
; RV32I-NEXT: srli s0, s0, 8
-; RV32I-NEXT: srli s8, t5, 24
-; RV32I-NEXT: srli s9, t5, 16
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: srli s10, t4, 24
-; RV32I-NEXT: srli s11, t4, 16
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: sb t0, 24(a2)
+; RV32I-NEXT: srli t0, t5, 24
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, t5, 16
+; RV32I-NEXT: srli t5, t5, 8
; RV32I-NEXT: sb t6, 26(a2)
+; RV32I-NEXT: srli t6, t4, 24
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, t0, 24
+; RV32I-NEXT: srli t1, t4, 16
+; RV32I-NEXT: srli t4, t4, 8
; RV32I-NEXT: sb a6, 16(a2)
+; RV32I-NEXT: srli a6, a7, 24
; RV32I-NEXT: sb t2, 17(a2)
; RV32I-NEXT: sb s3, 18(a2)
; RV32I-NEXT: sb s2, 19(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli a7, a7, 8
; RV32I-NEXT: sb a5, 20(a2)
; RV32I-NEXT: sb s1, 21(a2)
; RV32I-NEXT: sb s5, 22(a2)
@@ -2414,30 +2411,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: sb a3, 12(a2)
; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb s9, 14(a2)
-; RV32I-NEXT: sb s8, 15(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb t0, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb s11, 2(a2)
-; RV32I-NEXT: sb s10, 3(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb t6, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: sb t0, 5(a2)
-; RV32I-NEXT: sb a6, 6(a2)
-; RV32I-NEXT: sb a7, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb t2, 6(a2)
+; RV32I-NEXT: sb a6, 7(a2)
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2682,129 +2678,128 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
;
; RV32I-LABEL: lshr_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 1(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu s1, 3(a0)
-; RV32I-NEXT: lbu s7, 4(a0)
-; RV32I-NEXT: lbu s8, 5(a0)
-; RV32I-NEXT: lbu s4, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s5, 8(a0)
-; RV32I-NEXT: lbu s10, 9(a0)
-; RV32I-NEXT: lbu s11, 10(a0)
-; RV32I-NEXT: lbu ra, 11(a0)
-; RV32I-NEXT: lbu t4, 12(a0)
-; RV32I-NEXT: lbu t6, 13(a0)
-; RV32I-NEXT: lbu a5, 14(a0)
-; RV32I-NEXT: lbu a6, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t2, 17(a0)
-; RV32I-NEXT: lbu t3, 18(a0)
-; RV32I-NEXT: lbu t5, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu s0, 21(a0)
-; RV32I-NEXT: lbu s2, 22(a0)
-; RV32I-NEXT: lbu s3, 23(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s1, t1
-; RV32I-NEXT: or t1, s8, s7
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s7, 25(a0)
-; RV32I-NEXT: lbu s8, 26(a0)
-; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu s0, 22(a0)
+; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: slli s11, s11, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: or s4, s6, s4
-; RV32I-NEXT: or s5, s10, s5
-; RV32I-NEXT: or s6, ra, s11
-; RV32I-NEXT: lbu s10, 28(a0)
-; RV32I-NEXT: lbu s11, 29(a0)
-; RV32I-NEXT: lbu ra, 30(a0)
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t2, a3
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: or t3, s7, s1
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or a0, a0, ra
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s4, t1
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: or a5, a5, t4
-; RV32I-NEXT: or a3, a6, a3
-; RV32I-NEXT: or a4, t2, a4
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a0, a0, s0
-; RV32I-NEXT: add t6, t6, a1
-; RV32I-NEXT: sw a3, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw a6, 32(sp)
-; RV32I-NEXT: sw a0, 36(sp)
-; RV32I-NEXT: sw a7, 8(sp)
-; RV32I-NEXT: sw t0, 12(sp)
-; RV32I-NEXT: sw t1, 16(sp)
-; RV32I-NEXT: sw a5, 20(sp)
-; RV32I-NEXT: lw a6, 16(t6)
-; RV32I-NEXT: lw a5, 20(t6)
-; RV32I-NEXT: lw a7, 24(t6)
-; RV32I-NEXT: lw a1, 0(t6)
-; RV32I-NEXT: lw a0, 4(t6)
-; RV32I-NEXT: lw a4, 8(t6)
-; RV32I-NEXT: lw a3, 12(t6)
-; RV32I-NEXT: lw t0, 28(t6)
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: add s1, s1, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -2819,21 +2814,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
-; RV32I-NEXT: srli ra, a1, 24
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -2845,36 +2840,35 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: sb ra, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -2900,111 +2894,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a7, 1(a0)
-; RV64I-NEXT: lbu t2, 2(a0)
-; RV64I-NEXT: lbu s3, 3(a0)
-; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu s8, 5(a0)
-; RV64I-NEXT: lbu s9, 6(a0)
-; RV64I-NEXT: lbu s10, 7(a0)
-; RV64I-NEXT: lbu s2, 8(a0)
-; RV64I-NEXT: lbu s4, 9(a0)
-; RV64I-NEXT: lbu s5, 10(a0)
-; RV64I-NEXT: lbu s6, 11(a0)
-; RV64I-NEXT: lbu s7, 12(a0)
-; RV64I-NEXT: lbu s11, 13(a0)
-; RV64I-NEXT: lbu t1, 14(a0)
-; RV64I-NEXT: lbu t3, 15(a0)
-; RV64I-NEXT: lbu a3, 16(a0)
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu t4, 18(a0)
-; RV64I-NEXT: lbu t5, 19(a0)
-; RV64I-NEXT: lbu a4, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s0, 22(a0)
-; RV64I-NEXT: lbu s1, 23(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, s3, t2
-; RV64I-NEXT: or t0, s8, t0
-; RV64I-NEXT: or t2, s10, s9
-; RV64I-NEXT: lbu s3, 24(a0)
-; RV64I-NEXT: lbu s8, 25(a0)
-; RV64I-NEXT: lbu s9, 26(a0)
-; RV64I-NEXT: lbu s10, 27(a0)
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: slli s11, s11, 8
-; RV64I-NEXT: or s2, s4, s2
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: or s5, s11, s7
-; RV64I-NEXT: lbu s6, 28(a0)
-; RV64I-NEXT: lbu s7, 29(a0)
-; RV64I-NEXT: lbu s11, 30(a0)
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t3, 24(a0)
+; RV64I-NEXT: lbu t4, 25(a0)
+; RV64I-NEXT: lbu t5, 26(a0)
+; RV64I-NEXT: lbu t6, 27(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or s0, s5, s4
+; RV64I-NEXT: or s1, s7, s6
+; RV64I-NEXT: or s2, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
+; RV64I-NEXT: lbu s4, 29(a0)
+; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd zero, 40(sp)
; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or t1, t3, t1
-; RV64I-NEXT: mv t3, sp
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: slli t4, t4, 16
-; RV64I-NEXT: slli t5, t5, 24
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s0, s0, 16
-; RV64I-NEXT: slli s1, s1, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: slli s7, s7, 8
-; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: mv s7, sp
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a4, t6, a4
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or t4, s8, s3
-; RV64I-NEXT: or t5, s10, s9
-; RV64I-NEXT: or t6, s7, s6
-; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: or t4, t6, t5
+; RV64I-NEXT: or t5, s4, s3
+; RV64I-NEXT: or a0, a0, s5
; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t2, t0
-; RV64I-NEXT: or t0, s4, s2
-; RV64I-NEXT: or t1, t1, s5
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a4, s0, a4
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a0, a0, t6
-; RV64I-NEXT: add t3, t3, a1
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: or a7, s6, s2
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: add s7, s7, a1
; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: or a1, a7, a5
-; RV64I-NEXT: or a5, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: sd a1, 0(sp)
-; RV64I-NEXT: sd a5, 8(sp)
-; RV64I-NEXT: sd a3, 16(sp)
+; RV64I-NEXT: or a1, a6, a5
+; RV64I-NEXT: or a4, a7, s0
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: sd a1, 8(sp)
+; RV64I-NEXT: sd a4, 16(sp)
; RV64I-NEXT: sd a0, 24(sp)
-; RV64I-NEXT: ld a4, 16(t3)
-; RV64I-NEXT: ld a0, 8(t3)
-; RV64I-NEXT: ld a1, 0(t3)
-; RV64I-NEXT: ld a3, 24(t3)
+; RV64I-NEXT: ld a4, 16(s7)
+; RV64I-NEXT: ld a0, 8(s7)
+; RV64I-NEXT: ld a1, 0(s7)
+; RV64I-NEXT: ld a3, 24(s7)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -3023,25 +3017,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
-; RV64I-NEXT: srli s8, a1, 24
-; RV64I-NEXT: srli s9, a1, 16
-; RV64I-NEXT: srli s10, a1, 8
-; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
+; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
+; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 48
+; RV64I-NEXT: srli a5, a0, 56
; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli a4, a0, 40
+; RV64I-NEXT: srli t1, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli a6, a0, 32
+; RV64I-NEXT: srli t2, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -3051,19 +3045,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli a7, a0, 16
+; RV64I-NEXT: srli t3, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb s10, 1(a2)
-; RV64I-NEXT: sb s9, 2(a2)
-; RV64I-NEXT: sb s8, 3(a2)
+; RV64I-NEXT: sb a6, 1(a2)
+; RV64I-NEXT: sb a7, 2(a2)
+; RV64I-NEXT: sb t0, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a6, 12(a2)
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: sb a5, 14(a2)
-; RV64I-NEXT: sb s11, 15(a2)
+; RV64I-NEXT: sb t2, 12(a2)
+; RV64I-NEXT: sb t1, 13(a2)
+; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: sb a5, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a7, 10(a2)
+; RV64I-NEXT: sb t3, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -3082,129 +3076,128 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
;
; RV32I-LABEL: lshr_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 1(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu s1, 3(a0)
-; RV32I-NEXT: lbu s7, 4(a0)
-; RV32I-NEXT: lbu s8, 5(a0)
-; RV32I-NEXT: lbu s4, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s5, 8(a0)
-; RV32I-NEXT: lbu s10, 9(a0)
-; RV32I-NEXT: lbu s11, 10(a0)
-; RV32I-NEXT: lbu ra, 11(a0)
-; RV32I-NEXT: lbu t4, 12(a0)
-; RV32I-NEXT: lbu t6, 13(a0)
-; RV32I-NEXT: lbu a5, 14(a0)
-; RV32I-NEXT: lbu a6, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t2, 17(a0)
-; RV32I-NEXT: lbu t3, 18(a0)
-; RV32I-NEXT: lbu t5, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu s0, 21(a0)
-; RV32I-NEXT: lbu s2, 22(a0)
-; RV32I-NEXT: lbu s3, 23(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s1, t1
-; RV32I-NEXT: or t1, s8, s7
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s7, 25(a0)
-; RV32I-NEXT: lbu s8, 26(a0)
-; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu s0, 22(a0)
+; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: slli s11, s11, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: or s4, s6, s4
-; RV32I-NEXT: or s5, s10, s5
-; RV32I-NEXT: or s6, ra, s11
-; RV32I-NEXT: lbu s10, 28(a0)
-; RV32I-NEXT: lbu s11, 29(a0)
-; RV32I-NEXT: lbu ra, 30(a0)
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t2, a3
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: or t3, s7, s1
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or a0, a0, ra
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
; RV32I-NEXT: andi a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s4, t1
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: or a5, a5, t4
-; RV32I-NEXT: or a3, a6, a3
-; RV32I-NEXT: or a4, t2, a4
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a0, a0, s0
-; RV32I-NEXT: add t6, t6, a1
-; RV32I-NEXT: sw a3, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw a6, 32(sp)
-; RV32I-NEXT: sw a0, 36(sp)
-; RV32I-NEXT: sw a7, 8(sp)
-; RV32I-NEXT: sw t0, 12(sp)
-; RV32I-NEXT: sw t1, 16(sp)
-; RV32I-NEXT: sw a5, 20(sp)
-; RV32I-NEXT: lw a6, 16(t6)
-; RV32I-NEXT: lw a5, 20(t6)
-; RV32I-NEXT: lw a7, 24(t6)
-; RV32I-NEXT: lw a1, 0(t6)
-; RV32I-NEXT: lw a0, 4(t6)
-; RV32I-NEXT: lw a4, 8(t6)
-; RV32I-NEXT: lw a3, 12(t6)
-; RV32I-NEXT: lw t0, 28(t6)
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: add s1, s1, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -3219,21 +3212,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
-; RV32I-NEXT: srli ra, a1, 24
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -3245,36 +3238,35 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: sb ra, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -3518,132 +3510,129 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t3, 5(a0)
-; RV32I-NEXT: lbu t4, 6(a0)
-; RV32I-NEXT: lbu s0, 7(a0)
-; RV32I-NEXT: lbu t2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s6, 10(a0)
-; RV32I-NEXT: lbu s8, 11(a0)
-; RV32I-NEXT: lbu s9, 12(a0)
-; RV32I-NEXT: lbu s10, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s7, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu ra, 18(a0)
-; RV32I-NEXT: lbu a3, 19(a0)
-; RV32I-NEXT: lbu t5, 20(a0)
-; RV32I-NEXT: lbu t6, 21(a0)
-; RV32I-NEXT: lbu a7, 22(a0)
-; RV32I-NEXT: lbu t0, 23(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t3, t3, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t3, t1
-; RV32I-NEXT: or a6, s0, t4
-; RV32I-NEXT: lbu t1, 24(a0)
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
; RV32I-NEXT: lbu s0, 25(a0)
; RV32I-NEXT: lbu s1, 26(a0)
; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s5, s5, 8
; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: or t2, s3, t2
-; RV32I-NEXT: or t3, s8, s6
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s7, s4
-; RV32I-NEXT: or s4, s11, s5
-; RV32I-NEXT: or s5, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s10, 2(a1)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t5, t6, t5
-; RV32I-NEXT: addi t6, sp, 40
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: addi s4, sp, 32
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s0, t1
-; RV32I-NEXT: or t1, s2, s1
-; RV32I-NEXT: or s0, s6, s3
-; RV32I-NEXT: or s1, s9, s8
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s10
-; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s2
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or a0, a0, t4
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or a7, a7, t5
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: sw t2, 56(sp)
-; RV32I-NEXT: sw a7, 60(sp)
-; RV32I-NEXT: sw t0, 64(sp)
-; RV32I-NEXT: sw s0, 68(sp)
-; RV32I-NEXT: sw a4, 40(sp)
-; RV32I-NEXT: sw a5, 44(sp)
-; RV32I-NEXT: sw a6, 48(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or a0, a0, t5
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s5, s3
+; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: sw a7, 48(sp)
; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw t0, 56(sp)
+; RV32I-NEXT: sw t1, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a6, 44(sp)
; RV32I-NEXT: slli a3, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: sub a1, t6, a1
+; RV32I-NEXT: sub a1, s4, a1
; RV32I-NEXT: andi a0, a3, 24
; RV32I-NEXT: xori a0, a0, 31
; RV32I-NEXT: lw a4, 0(a1)
@@ -3658,10 +3647,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli t4, a4, 1
; RV32I-NEXT: sll t5, a7, a3
; RV32I-NEXT: srli t6, a6, 1
-; RV32I-NEXT: sll s0, a6, a3
+; RV32I-NEXT: sll a6, a6, a3
; RV32I-NEXT: srli a5, a5, 1
-; RV32I-NEXT: sll s1, t1, a3
-; RV32I-NEXT: srli a6, t0, 1
+; RV32I-NEXT: sll s0, t1, a3
+; RV32I-NEXT: srli s1, t0, 1
; RV32I-NEXT: sll s2, t0, a3
; RV32I-NEXT: srli a7, a7, 1
; RV32I-NEXT: sll s3, a1, a3
@@ -3669,56 +3658,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sll s4, t2, a3
; RV32I-NEXT: srli t0, t1, 1
; RV32I-NEXT: sll s5, a4, a3
-; RV32I-NEXT: srl t2, t4, a0
-; RV32I-NEXT: srl t4, t6, a0
-; RV32I-NEXT: srl t6, a5, a0
-; RV32I-NEXT: srl s6, a6, a0
-; RV32I-NEXT: srl s7, a7, a0
-; RV32I-NEXT: srl s8, a1, a0
-; RV32I-NEXT: srl s9, t0, a0
-; RV32I-NEXT: srli t1, s4, 24
-; RV32I-NEXT: srli a7, s3, 24
+; RV32I-NEXT: srl t4, t4, a0
+; RV32I-NEXT: srl a4, t6, a0
+; RV32I-NEXT: srl t1, a5, a0
+; RV32I-NEXT: srl t6, s1, a0
+; RV32I-NEXT: srl s1, a7, a0
+; RV32I-NEXT: srl s6, a1, a0
+; RV32I-NEXT: srl s7, t0, a0
+; RV32I-NEXT: srli t2, s4, 24
+; RV32I-NEXT: srli t0, s3, 24
; RV32I-NEXT: srli a5, s2, 24
-; RV32I-NEXT: srli a3, s1, 24
-; RV32I-NEXT: srli a1, s0, 24
+; RV32I-NEXT: srli a3, s0, 24
+; RV32I-NEXT: srli a1, a6, 24
; RV32I-NEXT: srli a0, t5, 24
-; RV32I-NEXT: srli s10, s5, 24
-; RV32I-NEXT: srli s11, s5, 16
-; RV32I-NEXT: srli ra, s5, 8
-; RV32I-NEXT: srli a4, t3, 24
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or t0, t5, t4
-; RV32I-NEXT: or t2, s0, t6
-; RV32I-NEXT: or t3, s1, s6
-; RV32I-NEXT: or t4, s2, s7
-; RV32I-NEXT: or t5, s3, s8
-; RV32I-NEXT: or t6, s4, s9
+; RV32I-NEXT: srli s8, s5, 24
+; RV32I-NEXT: or a4, t5, a4
+; RV32I-NEXT: srli t5, s5, 16
+; RV32I-NEXT: or t1, a6, t1
+; RV32I-NEXT: srli s9, s5, 8
+; RV32I-NEXT: or a7, t3, t4
+; RV32I-NEXT: srli a6, t3, 24
+; RV32I-NEXT: or t3, s0, t6
+; RV32I-NEXT: or t4, s2, s1
+; RV32I-NEXT: or t6, s3, s6
+; RV32I-NEXT: or s0, s4, s7
; RV32I-NEXT: sb s5, 0(a2)
-; RV32I-NEXT: sb ra, 1(a2)
-; RV32I-NEXT: sb s11, 2(a2)
-; RV32I-NEXT: sb s10, 3(a2)
-; RV32I-NEXT: srli s0, t6, 16
-; RV32I-NEXT: srli s1, t6, 8
-; RV32I-NEXT: srli s2, t5, 16
-; RV32I-NEXT: srli s3, t5, 8
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb t5, 2(a2)
+; RV32I-NEXT: sb s8, 3(a2)
+; RV32I-NEXT: srli t5, s0, 16
+; RV32I-NEXT: srli s1, s0, 8
+; RV32I-NEXT: srli s2, t6, 16
+; RV32I-NEXT: srli s3, t6, 8
; RV32I-NEXT: srli s4, t4, 16
; RV32I-NEXT: srli s5, t4, 8
; RV32I-NEXT: srli s6, t3, 16
; RV32I-NEXT: srli s7, t3, 8
-; RV32I-NEXT: srli s8, t2, 16
-; RV32I-NEXT: srli s9, t2, 8
-; RV32I-NEXT: srli s10, t0, 16
-; RV32I-NEXT: srli s11, t0, 8
-; RV32I-NEXT: sb t6, 24(a2)
+; RV32I-NEXT: sb s0, 24(a2)
+; RV32I-NEXT: srli s0, t1, 16
; RV32I-NEXT: sb s1, 25(a2)
-; RV32I-NEXT: sb s0, 26(a2)
-; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a6, 16
-; RV32I-NEXT: sb t5, 28(a2)
+; RV32I-NEXT: srli s1, t1, 8
+; RV32I-NEXT: sb t5, 26(a2)
+; RV32I-NEXT: srli t5, a4, 16
+; RV32I-NEXT: sb t2, 27(a2)
+; RV32I-NEXT: srli t2, a4, 8
+; RV32I-NEXT: sb t6, 28(a2)
+; RV32I-NEXT: srli t6, a7, 16
; RV32I-NEXT: sb s3, 29(a2)
; RV32I-NEXT: sb s2, 30(a2)
-; RV32I-NEXT: sb a7, 31(a2)
-; RV32I-NEXT: srli a7, a6, 8
+; RV32I-NEXT: sb t0, 31(a2)
+; RV32I-NEXT: srli t0, a7, 8
; RV32I-NEXT: sb t4, 16(a2)
; RV32I-NEXT: sb s5, 17(a2)
; RV32I-NEXT: sb s4, 18(a2)
@@ -3727,32 +3716,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s7, 21(a2)
; RV32I-NEXT: sb s6, 22(a2)
; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: sb t2, 8(a2)
-; RV32I-NEXT: sb s9, 9(a2)
-; RV32I-NEXT: sb s8, 10(a2)
+; RV32I-NEXT: sb t1, 8(a2)
+; RV32I-NEXT: sb s1, 9(a2)
+; RV32I-NEXT: sb s0, 10(a2)
; RV32I-NEXT: sb a1, 11(a2)
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb t2, 13(a2)
+; RV32I-NEXT: sb t5, 14(a2)
; RV32I-NEXT: sb a0, 15(a2)
-; RV32I-NEXT: sb a6, 4(a2)
-; RV32I-NEXT: sb a7, 5(a2)
-; RV32I-NEXT: sb t1, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: sb a7, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb t6, 6(a2)
+; RV32I-NEXT: sb a6, 7(a2)
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -3997,129 +3985,128 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
;
; RV32I-LABEL: shl_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 1(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu s1, 3(a0)
-; RV32I-NEXT: lbu s7, 4(a0)
-; RV32I-NEXT: lbu s8, 5(a0)
-; RV32I-NEXT: lbu s4, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s5, 8(a0)
-; RV32I-NEXT: lbu s10, 9(a0)
-; RV32I-NEXT: lbu s11, 10(a0)
-; RV32I-NEXT: lbu ra, 11(a0)
-; RV32I-NEXT: lbu t4, 12(a0)
-; RV32I-NEXT: lbu t6, 13(a0)
-; RV32I-NEXT: lbu a5, 14(a0)
-; RV32I-NEXT: lbu a6, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t2, 17(a0)
-; RV32I-NEXT: lbu t3, 18(a0)
-; RV32I-NEXT: lbu t5, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu s0, 21(a0)
-; RV32I-NEXT: lbu s2, 22(a0)
-; RV32I-NEXT: lbu s3, 23(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s1, t1
-; RV32I-NEXT: or t1, s8, s7
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s7, 25(a0)
-; RV32I-NEXT: lbu s8, 26(a0)
-; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu s0, 22(a0)
+; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: slli s11, s11, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: or s4, s6, s4
-; RV32I-NEXT: or s5, s10, s5
-; RV32I-NEXT: or s6, ra, s11
-; RV32I-NEXT: lbu s10, 28(a0)
-; RV32I-NEXT: lbu s11, 29(a0)
-; RV32I-NEXT: lbu ra, 30(a0)
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 40
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: addi s1, sp, 32
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t2, a3
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: or t3, s7, s1
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or a0, a0, ra
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s4, t1
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: or a5, a5, t4
-; RV32I-NEXT: or a3, a6, a3
-; RV32I-NEXT: or a4, t2, a4
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a0, a0, s0
-; RV32I-NEXT: sub t2, t6, a1
-; RV32I-NEXT: sw a3, 56(sp)
-; RV32I-NEXT: sw a4, 60(sp)
-; RV32I-NEXT: sw a6, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
-; RV32I-NEXT: sw a7, 40(sp)
-; RV32I-NEXT: sw t0, 44(sp)
-; RV32I-NEXT: sw t1, 48(sp)
-; RV32I-NEXT: sw a5, 52(sp)
-; RV32I-NEXT: lw a6, 16(t2)
-; RV32I-NEXT: lw a5, 20(t2)
-; RV32I-NEXT: lw a7, 24(t2)
-; RV32I-NEXT: lw a1, 0(t2)
-; RV32I-NEXT: lw a0, 4(t2)
-; RV32I-NEXT: lw a4, 8(t2)
-; RV32I-NEXT: lw a3, 12(t2)
-; RV32I-NEXT: lw t0, 28(t2)
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: sub s1, s1, a1
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -4134,21 +4121,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
-; RV32I-NEXT: srli ra, a1, 24
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -4160,36 +4147,35 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: sb ra, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -4215,111 +4201,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a7, 1(a0)
-; RV64I-NEXT: lbu t2, 2(a0)
-; RV64I-NEXT: lbu s3, 3(a0)
-; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu s8, 5(a0)
-; RV64I-NEXT: lbu s9, 6(a0)
-; RV64I-NEXT: lbu s10, 7(a0)
-; RV64I-NEXT: lbu s2, 8(a0)
-; RV64I-NEXT: lbu s4, 9(a0)
-; RV64I-NEXT: lbu s5, 10(a0)
-; RV64I-NEXT: lbu s6, 11(a0)
-; RV64I-NEXT: lbu s7, 12(a0)
-; RV64I-NEXT: lbu s11, 13(a0)
-; RV64I-NEXT: lbu t1, 14(a0)
-; RV64I-NEXT: lbu t3, 15(a0)
-; RV64I-NEXT: lbu a3, 16(a0)
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu t4, 18(a0)
-; RV64I-NEXT: lbu t5, 19(a0)
-; RV64I-NEXT: lbu a4, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s0, 22(a0)
-; RV64I-NEXT: lbu s1, 23(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, s3, t2
-; RV64I-NEXT: or t0, s8, t0
-; RV64I-NEXT: or t2, s10, s9
-; RV64I-NEXT: lbu s3, 24(a0)
-; RV64I-NEXT: lbu s8, 25(a0)
-; RV64I-NEXT: lbu s9, 26(a0)
-; RV64I-NEXT: lbu s10, 27(a0)
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: slli s11, s11, 8
-; RV64I-NEXT: or s2, s4, s2
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: or s5, s11, s7
-; RV64I-NEXT: lbu s6, 28(a0)
-; RV64I-NEXT: lbu s7, 29(a0)
-; RV64I-NEXT: lbu s11, 30(a0)
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t3, 24(a0)
+; RV64I-NEXT: lbu t4, 25(a0)
+; RV64I-NEXT: lbu t5, 26(a0)
+; RV64I-NEXT: lbu t6, 27(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or s0, s5, s4
+; RV64I-NEXT: or s1, s7, s6
+; RV64I-NEXT: or s2, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
+; RV64I-NEXT: lbu s4, 29(a0)
+; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd zero, 8(sp)
; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or t1, t3, t1
-; RV64I-NEXT: addi t3, sp, 32
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: slli t4, t4, 16
-; RV64I-NEXT: slli t5, t5, 24
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s0, s0, 16
-; RV64I-NEXT: slli s1, s1, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: slli s7, s7, 8
-; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: addi s7, sp, 32
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a4, t6, a4
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or t4, s8, s3
-; RV64I-NEXT: or t5, s10, s9
-; RV64I-NEXT: or t6, s7, s6
-; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: or t4, t6, t5
+; RV64I-NEXT: or t5, s4, s3
+; RV64I-NEXT: or a0, a0, s5
; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t2, t0
-; RV64I-NEXT: or t0, s4, s2
-; RV64I-NEXT: or t1, t1, s5
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a4, s0, a4
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a0, a0, t6
-; RV64I-NEXT: sub t2, t3, a1
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: or a7, s6, s2
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: sub t1, s7, a1
; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: or a1, a7, a5
-; RV64I-NEXT: or a5, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: sd a1, 32(sp)
-; RV64I-NEXT: sd a5, 40(sp)
-; RV64I-NEXT: sd a3, 48(sp)
+; RV64I-NEXT: or a1, a6, a5
+; RV64I-NEXT: or a4, a7, s0
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: sd a1, 40(sp)
+; RV64I-NEXT: sd a4, 48(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: ld a4, 16(t2)
-; RV64I-NEXT: ld a0, 8(t2)
-; RV64I-NEXT: ld a1, 0(t2)
-; RV64I-NEXT: ld a3, 24(t2)
+; RV64I-NEXT: ld a4, 16(t1)
+; RV64I-NEXT: ld a0, 8(t1)
+; RV64I-NEXT: ld a1, 0(t1)
+; RV64I-NEXT: ld a3, 24(t1)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -4338,25 +4324,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
-; RV64I-NEXT: srli s8, a1, 24
-; RV64I-NEXT: srli s9, a1, 16
-; RV64I-NEXT: srli s10, a1, 8
-; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
+; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
+; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 48
+; RV64I-NEXT: srli a5, a0, 56
; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli a4, a0, 40
+; RV64I-NEXT: srli t1, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli a6, a0, 32
+; RV64I-NEXT: srli t2, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -4366,19 +4352,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli a7, a0, 16
+; RV64I-NEXT: srli t3, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb s10, 1(a2)
-; RV64I-NEXT: sb s9, 2(a2)
-; RV64I-NEXT: sb s8, 3(a2)
+; RV64I-NEXT: sb a6, 1(a2)
+; RV64I-NEXT: sb a7, 2(a2)
+; RV64I-NEXT: sb t0, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a6, 12(a2)
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: sb a5, 14(a2)
-; RV64I-NEXT: sb s11, 15(a2)
+; RV64I-NEXT: sb t2, 12(a2)
+; RV64I-NEXT: sb t1, 13(a2)
+; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: sb a5, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a7, 10(a2)
+; RV64I-NEXT: sb t3, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -4397,129 +4383,128 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
;
; RV32I-LABEL: shl_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 1(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu s1, 3(a0)
-; RV32I-NEXT: lbu s7, 4(a0)
-; RV32I-NEXT: lbu s8, 5(a0)
-; RV32I-NEXT: lbu s4, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s5, 8(a0)
-; RV32I-NEXT: lbu s10, 9(a0)
-; RV32I-NEXT: lbu s11, 10(a0)
-; RV32I-NEXT: lbu ra, 11(a0)
-; RV32I-NEXT: lbu t4, 12(a0)
-; RV32I-NEXT: lbu t6, 13(a0)
-; RV32I-NEXT: lbu a5, 14(a0)
-; RV32I-NEXT: lbu a6, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t2, 17(a0)
-; RV32I-NEXT: lbu t3, 18(a0)
-; RV32I-NEXT: lbu t5, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu s0, 21(a0)
-; RV32I-NEXT: lbu s2, 22(a0)
-; RV32I-NEXT: lbu s3, 23(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s1, t1
-; RV32I-NEXT: or t1, s8, s7
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s7, 25(a0)
-; RV32I-NEXT: lbu s8, 26(a0)
-; RV32I-NEXT: lbu s9, 27(a0)
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu s0, 22(a0)
+; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: slli s11, s11, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: or s4, s6, s4
-; RV32I-NEXT: or s5, s10, s5
-; RV32I-NEXT: or s6, ra, s11
-; RV32I-NEXT: lbu s10, 28(a0)
-; RV32I-NEXT: lbu s11, 29(a0)
-; RV32I-NEXT: lbu ra, 30(a0)
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 40
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli s0, s0, 16
+; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: addi s1, sp, 32
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t2, a3
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: or t3, s7, s1
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or a0, a0, ra
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
; RV32I-NEXT: andi a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s4, t1
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: or a5, a5, t4
-; RV32I-NEXT: or a3, a6, a3
-; RV32I-NEXT: or a4, t2, a4
-; RV32I-NEXT: or a6, t5, t3
-; RV32I-NEXT: or a0, a0, s0
-; RV32I-NEXT: sub t2, t6, a1
-; RV32I-NEXT: sw a3, 56(sp)
-; RV32I-NEXT: sw a4, 60(sp)
-; RV32I-NEXT: sw a6, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
-; RV32I-NEXT: sw a7, 40(sp)
-; RV32I-NEXT: sw t0, 44(sp)
-; RV32I-NEXT: sw t1, 48(sp)
-; RV32I-NEXT: sw a5, 52(sp)
-; RV32I-NEXT: lw a6, 16(t2)
-; RV32I-NEXT: lw a5, 20(t2)
-; RV32I-NEXT: lw a7, 24(t2)
-; RV32I-NEXT: lw a1, 0(t2)
-; RV32I-NEXT: lw a0, 4(t2)
-; RV32I-NEXT: lw a4, 8(t2)
-; RV32I-NEXT: lw a3, 12(t2)
-; RV32I-NEXT: lw t0, 28(t2)
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: sub s1, s1, a1
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -4534,21 +4519,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
-; RV32I-NEXT: srli ra, a1, 24
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -4560,36 +4545,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: sb ra, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -4834,140 +4818,137 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t6, 0(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t3, 5(a0)
-; RV32I-NEXT: lbu t4, 6(a0)
-; RV32I-NEXT: lbu t5, 7(a0)
-; RV32I-NEXT: lbu t2, 8(a0)
-; RV32I-NEXT: lbu s1, 9(a0)
-; RV32I-NEXT: lbu s7, 10(a0)
-; RV32I-NEXT: lbu s8, 11(a0)
-; RV32I-NEXT: lbu s9, 12(a0)
-; RV32I-NEXT: lbu s10, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s6, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu ra, 18(a0)
-; RV32I-NEXT: lbu a3, 19(a0)
-; RV32I-NEXT: lbu s2, 20(a0)
-; RV32I-NEXT: lbu s3, 21(a0)
-; RV32I-NEXT: lbu a7, 22(a0)
-; RV32I-NEXT: lbu t0, 23(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t3, t3, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: or a4, a4, t6
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t3, t1
-; RV32I-NEXT: or a6, t5, t4
-; RV32I-NEXT: lbu t1, 24(a0)
-; RV32I-NEXT: lbu t5, 25(a0)
-; RV32I-NEXT: lbu t6, 26(a0)
-; RV32I-NEXT: lbu s0, 27(a0)
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s7, s7, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: or t2, s1, t2
-; RV32I-NEXT: or t3, s8, s7
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: lbu s1, 28(a0)
-; RV32I-NEXT: lbu s7, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s6, s4
-; RV32I-NEXT: or s4, s11, s5
-; RV32I-NEXT: or s5, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s6, 1(a1)
-; RV32I-NEXT: lbu s10, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: or s2, s3, s2
-; RV32I-NEXT: addi s3, sp, 8
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: slli t5, t5, 8
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, t5, t1
-; RV32I-NEXT: or t1, s0, t6
-; RV32I-NEXT: or t5, s7, s1
-; RV32I-NEXT: or t6, s9, s8
-; RV32I-NEXT: or a3, s6, a3
-; RV32I-NEXT: or a1, a1, s10
-; RV32I-NEXT: srai s0, s9, 31
-; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or a0, a0, t4
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or a7, a7, s2
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: or t1, t6, t5
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: sw s0, 56(sp)
-; RV32I-NEXT: sw s0, 60(sp)
-; RV32I-NEXT: sw s0, 64(sp)
-; RV32I-NEXT: sw s0, 68(sp)
-; RV32I-NEXT: sw s0, 40(sp)
-; RV32I-NEXT: sw s0, 44(sp)
-; RV32I-NEXT: sw s0, 48(sp)
-; RV32I-NEXT: sw s0, 52(sp)
-; RV32I-NEXT: sw t2, 24(sp)
-; RV32I-NEXT: sw a7, 28(sp)
-; RV32I-NEXT: sw t0, 32(sp)
-; RV32I-NEXT: sw t1, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: or s5, a0, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, s6, t5
+; RV32I-NEXT: or t1, s0, t6
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: slli t1, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: add a1, s3, a1
+; RV32I-NEXT: add a1, s4, a1
; RV32I-NEXT: andi a0, t1, 24
-; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: xori a7, a0, 31
; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a6, 12(a1)
-; RV32I-NEXT: lw a7, 16(a1)
+; RV32I-NEXT: lw t0, 16(a1)
; RV32I-NEXT: lw t2, 20(a1)
; RV32I-NEXT: lw t3, 24(a1)
; RV32I-NEXT: lw t4, 28(a1)
@@ -4976,33 +4957,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a1, a3, t1
; RV32I-NEXT: slli t6, a4, 1
; RV32I-NEXT: srl a3, a6, t1
-; RV32I-NEXT: slli s0, a7, 1
+; RV32I-NEXT: slli s0, t0, 1
; RV32I-NEXT: srl a4, a5, t1
; RV32I-NEXT: slli s1, a6, 1
; RV32I-NEXT: srl a5, t2, t1
; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: srl a6, a7, t1
+; RV32I-NEXT: srl a6, t0, t1
; RV32I-NEXT: slli t2, t2, 1
-; RV32I-NEXT: srl a7, t3, t1
+; RV32I-NEXT: srl t0, t3, t1
; RV32I-NEXT: slli t3, t4, 1
; RV32I-NEXT: sra t1, t4, t1
-; RV32I-NEXT: sll t4, t5, t0
-; RV32I-NEXT: sll t5, t6, t0
-; RV32I-NEXT: sll t6, s0, t0
-; RV32I-NEXT: sll s0, s1, t0
-; RV32I-NEXT: sll s1, s2, t0
-; RV32I-NEXT: sll t2, t2, t0
-; RV32I-NEXT: sll t3, t3, t0
+; RV32I-NEXT: sll t4, t5, a7
+; RV32I-NEXT: sll t5, t6, a7
+; RV32I-NEXT: sll t6, s0, a7
+; RV32I-NEXT: sll s0, s1, a7
+; RV32I-NEXT: sll s1, s2, a7
+; RV32I-NEXT: sll t2, t2, a7
+; RV32I-NEXT: sll t3, t3, a7
; RV32I-NEXT: srli s2, t1, 24
; RV32I-NEXT: srli s3, t1, 16
; RV32I-NEXT: srli s4, t1, 8
-; RV32I-NEXT: or t0, a0, t4
+; RV32I-NEXT: or a7, a0, t4
; RV32I-NEXT: or t4, a1, t5
; RV32I-NEXT: or t5, a3, t6
; RV32I-NEXT: or s0, a4, s0
; RV32I-NEXT: or s1, a5, s1
; RV32I-NEXT: or t2, a6, t2
-; RV32I-NEXT: or t3, a7, t3
+; RV32I-NEXT: or t3, t0, t3
; RV32I-NEXT: sb t1, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
; RV32I-NEXT: sb s3, 30(a2)
@@ -5019,23 +5000,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s6, s0, 24
; RV32I-NEXT: srli s7, s0, 16
; RV32I-NEXT: srli s0, s0, 8
-; RV32I-NEXT: srli s8, t5, 24
-; RV32I-NEXT: srli s9, t5, 16
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: srli s10, t4, 24
-; RV32I-NEXT: srli s11, t4, 16
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: sb t0, 24(a2)
+; RV32I-NEXT: srli t0, t5, 24
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, t5, 16
+; RV32I-NEXT: srli t5, t5, 8
; RV32I-NEXT: sb t6, 26(a2)
+; RV32I-NEXT: srli t6, t4, 24
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, t0, 24
+; RV32I-NEXT: srli t1, t4, 16
+; RV32I-NEXT: srli t4, t4, 8
; RV32I-NEXT: sb a6, 16(a2)
+; RV32I-NEXT: srli a6, a7, 24
; RV32I-NEXT: sb t2, 17(a2)
; RV32I-NEXT: sb s3, 18(a2)
; RV32I-NEXT: sb s2, 19(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli a7, a7, 8
; RV32I-NEXT: sb a5, 20(a2)
; RV32I-NEXT: sb s1, 21(a2)
; RV32I-NEXT: sb s5, 22(a2)
@@ -5046,30 +5027,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: sb a3, 12(a2)
; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb s9, 14(a2)
-; RV32I-NEXT: sb s8, 15(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb t0, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb s11, 2(a2)
-; RV32I-NEXT: sb s10, 3(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb t6, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: sb t0, 5(a2)
-; RV32I-NEXT: sb a6, 6(a2)
-; RV32I-NEXT: sb a7, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb t2, 6(a2)
+; RV32I-NEXT: sb a6, 7(a2)
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -5315,130 +5295,129 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
;
; RV32I-LABEL: ashr_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a6, 0(a0)
-; RV32I-NEXT: lbu t0, 1(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t6, 3(a0)
-; RV32I-NEXT: lbu s7, 4(a0)
-; RV32I-NEXT: lbu s8, 5(a0)
-; RV32I-NEXT: lbu s3, 6(a0)
-; RV32I-NEXT: lbu s5, 7(a0)
-; RV32I-NEXT: lbu s4, 8(a0)
-; RV32I-NEXT: lbu s9, 9(a0)
-; RV32I-NEXT: lbu s10, 10(a0)
-; RV32I-NEXT: lbu s11, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s6, 13(a0)
-; RV32I-NEXT: lbu a5, 14(a0)
-; RV32I-NEXT: lbu a7, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t2, 17(a0)
-; RV32I-NEXT: lbu t3, 18(a0)
-; RV32I-NEXT: lbu t4, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t5, 21(a0)
-; RV32I-NEXT: lbu s0, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: or t0, t6, t1
-; RV32I-NEXT: or t1, s8, s7
-; RV32I-NEXT: lbu t6, 24(a0)
-; RV32I-NEXT: lbu s7, 25(a0)
-; RV32I-NEXT: lbu s8, 26(a0)
-; RV32I-NEXT: lbu ra, 27(a0)
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t5, 25(a0)
+; RV32I-NEXT: lbu t6, 26(a0)
+; RV32I-NEXT: lbu s0, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: or s3, s5, s3
-; RV32I-NEXT: or s4, s9, s4
-; RV32I-NEXT: or s5, s11, s10
-; RV32I-NEXT: lbu s9, 28(a0)
-; RV32I-NEXT: lbu s10, 29(a0)
-; RV32I-NEXT: lbu s11, 30(a0)
+; RV32I-NEXT: or t4, s5, s4
+; RV32I-NEXT: or s1, s7, s6
+; RV32I-NEXT: or s2, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: or s2, s6, s2
-; RV32I-NEXT: addi s6, sp, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: mv s7, sp
; RV32I-NEXT: slli t5, t5, 8
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: slli s0, s0, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a3, t2, a3
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or a4, t5, a4
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or t2, s7, t6
-; RV32I-NEXT: or t3, ra, s8
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: or t5, a0, s11
+; RV32I-NEXT: or t3, t5, t3
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, s4, s3
+; RV32I-NEXT: or s0, a0, s5
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: or t0, s3, t1
-; RV32I-NEXT: or t1, s5, s4
-; RV32I-NEXT: or a5, a5, s2
-; RV32I-NEXT: or a3, a7, a3
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or a7, t3, t2
-; RV32I-NEXT: or t2, t5, t4
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, s1, t4
+; RV32I-NEXT: or t0, s6, s2
+; RV32I-NEXT: or t1, t5, t3
+; RV32I-NEXT: or t2, s0, t6
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: sw a0, 48(sp)
-; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: add s6, s6, a1
-; RV32I-NEXT: sw a3, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw a7, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw t0, 12(sp)
-; RV32I-NEXT: sw t1, 16(sp)
-; RV32I-NEXT: sw a5, 20(sp)
-; RV32I-NEXT: lw a6, 16(s6)
-; RV32I-NEXT: lw a5, 20(s6)
-; RV32I-NEXT: lw a7, 24(s6)
-; RV32I-NEXT: lw a1, 0(s6)
-; RV32I-NEXT: lw a0, 4(s6)
-; RV32I-NEXT: lw a4, 8(s6)
-; RV32I-NEXT: lw a3, 12(s6)
-; RV32I-NEXT: lw t0, 28(s6)
+; RV32I-NEXT: add s7, s7, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: lw a6, 16(s7)
+; RV32I-NEXT: lw a5, 20(s7)
+; RV32I-NEXT: lw a7, 24(s7)
+; RV32I-NEXT: lw a1, 0(s7)
+; RV32I-NEXT: lw a0, 4(s7)
+; RV32I-NEXT: lw a4, 8(s7)
+; RV32I-NEXT: lw a3, 12(s7)
+; RV32I-NEXT: lw t0, 28(s7)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -5453,21 +5432,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
-; RV32I-NEXT: srli ra, a1, 24
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -5479,36 +5458,35 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: sb ra, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -5534,112 +5512,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a7, 1(a0)
-; RV64I-NEXT: lbu t1, 2(a0)
-; RV64I-NEXT: lbu s3, 3(a0)
-; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu s8, 5(a0)
-; RV64I-NEXT: lbu s9, 6(a0)
-; RV64I-NEXT: lbu s10, 7(a0)
-; RV64I-NEXT: lbu s2, 8(a0)
-; RV64I-NEXT: lbu s4, 9(a0)
-; RV64I-NEXT: lbu s5, 10(a0)
-; RV64I-NEXT: lbu s6, 11(a0)
-; RV64I-NEXT: lbu s7, 12(a0)
-; RV64I-NEXT: lbu s11, 13(a0)
-; RV64I-NEXT: lbu t4, 14(a0)
-; RV64I-NEXT: lbu t5, 15(a0)
-; RV64I-NEXT: lbu a3, 16(a0)
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu t2, 18(a0)
-; RV64I-NEXT: lbu t3, 19(a0)
-; RV64I-NEXT: lbu a4, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s0, 22(a0)
-; RV64I-NEXT: lbu s1, 23(a0)
-; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, s3, t1
-; RV64I-NEXT: or t0, s8, t0
-; RV64I-NEXT: or t1, s10, s9
-; RV64I-NEXT: lbu s3, 24(a0)
-; RV64I-NEXT: lbu s8, 25(a0)
-; RV64I-NEXT: lbu s9, 26(a0)
-; RV64I-NEXT: lbu s10, 27(a0)
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: slli s11, s11, 8
-; RV64I-NEXT: or s2, s4, s2
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: or s5, s11, s7
-; RV64I-NEXT: lbu s6, 28(a0)
-; RV64I-NEXT: lbu s7, 29(a0)
-; RV64I-NEXT: lbu s11, 30(a0)
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t3, 24(a0)
+; RV64I-NEXT: lbu t4, 25(a0)
+; RV64I-NEXT: lbu t5, 26(a0)
+; RV64I-NEXT: lbu t6, 27(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or s0, s5, s4
+; RV64I-NEXT: or s1, s7, s6
+; RV64I-NEXT: or s2, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
+; RV64I-NEXT: lbu s4, 29(a0)
+; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli t4, t4, 16
-; RV64I-NEXT: slli t5, t5, 24
-; RV64I-NEXT: or t4, t5, t4
-; RV64I-NEXT: mv t5, sp
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s0, s0, 16
-; RV64I-NEXT: slli s1, s1, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: slli s7, s7, 8
-; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: mv s7, sp
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a6, t3, t2
-; RV64I-NEXT: or a4, t6, a4
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or t2, s8, s3
-; RV64I-NEXT: or t3, s10, s9
-; RV64I-NEXT: or t6, s7, s6
-; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: or t4, t6, t5
+; RV64I-NEXT: or t5, s4, s3
+; RV64I-NEXT: or a0, a0, s5
; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t1, t0
-; RV64I-NEXT: or t0, s4, s2
-; RV64I-NEXT: or t1, t4, s5
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a4, s0, a4
-; RV64I-NEXT: or a6, t3, t2
-; RV64I-NEXT: or a0, a0, t6
-; RV64I-NEXT: add t5, t5, a1
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: or a7, s6, s2
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: add s7, s7, a1
; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: sraiw a0, a0, 31
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, a7, s0
+; RV64I-NEXT: or a1, a1, t0
; RV64I-NEXT: sd a0, 32(sp)
; RV64I-NEXT: sd a0, 40(sp)
; RV64I-NEXT: sd a0, 48(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: sd a5, 0(sp)
-; RV64I-NEXT: sd a7, 8(sp)
-; RV64I-NEXT: sd a3, 16(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a5, 16(sp)
; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a4, 16(t5)
-; RV64I-NEXT: ld a0, 8(t5)
-; RV64I-NEXT: ld a1, 0(t5)
-; RV64I-NEXT: ld a3, 24(t5)
+; RV64I-NEXT: ld a4, 16(s7)
+; RV64I-NEXT: ld a0, 8(s7)
+; RV64I-NEXT: ld a1, 0(s7)
+; RV64I-NEXT: ld a3, 24(s7)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -5658,25 +5636,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
-; RV64I-NEXT: srli s8, a1, 24
-; RV64I-NEXT: srli s9, a1, 16
-; RV64I-NEXT: srli s10, a1, 8
-; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
+; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
+; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 48
+; RV64I-NEXT: srli a5, a0, 56
; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli a4, a0, 40
+; RV64I-NEXT: srli t1, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli a6, a0, 32
+; RV64I-NEXT: srli t2, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -5686,19 +5664,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli a7, a0, 16
+; RV64I-NEXT: srli t3, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb s10, 1(a2)
-; RV64I-NEXT: sb s9, 2(a2)
-; RV64I-NEXT: sb s8, 3(a2)
+; RV64I-NEXT: sb a6, 1(a2)
+; RV64I-NEXT: sb a7, 2(a2)
+; RV64I-NEXT: sb t0, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a6, 12(a2)
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: sb a5, 14(a2)
-; RV64I-NEXT: sb s11, 15(a2)
+; RV64I-NEXT: sb t2, 12(a2)
+; RV64I-NEXT: sb t1, 13(a2)
+; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: sb a5, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a7, 10(a2)
+; RV64I-NEXT: sb t3, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -5717,130 +5695,129 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
;
; RV32I-LABEL: ashr_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a6, 0(a0)
-; RV32I-NEXT: lbu t0, 1(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t6, 3(a0)
-; RV32I-NEXT: lbu s7, 4(a0)
-; RV32I-NEXT: lbu s8, 5(a0)
-; RV32I-NEXT: lbu s3, 6(a0)
-; RV32I-NEXT: lbu s5, 7(a0)
-; RV32I-NEXT: lbu s4, 8(a0)
-; RV32I-NEXT: lbu s9, 9(a0)
-; RV32I-NEXT: lbu s10, 10(a0)
-; RV32I-NEXT: lbu s11, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s6, 13(a0)
-; RV32I-NEXT: lbu a5, 14(a0)
-; RV32I-NEXT: lbu a7, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t2, 17(a0)
-; RV32I-NEXT: lbu t3, 18(a0)
-; RV32I-NEXT: lbu t4, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t5, 21(a0)
-; RV32I-NEXT: lbu s0, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: or t0, t6, t1
-; RV32I-NEXT: or t1, s8, s7
-; RV32I-NEXT: lbu t6, 24(a0)
-; RV32I-NEXT: lbu s7, 25(a0)
-; RV32I-NEXT: lbu s8, 26(a0)
-; RV32I-NEXT: lbu ra, 27(a0)
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t5, 25(a0)
+; RV32I-NEXT: lbu t6, 26(a0)
+; RV32I-NEXT: lbu s0, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: or s3, s5, s3
-; RV32I-NEXT: or s4, s9, s4
-; RV32I-NEXT: or s5, s11, s10
-; RV32I-NEXT: lbu s9, 28(a0)
-; RV32I-NEXT: lbu s10, 29(a0)
-; RV32I-NEXT: lbu s11, 30(a0)
+; RV32I-NEXT: or t4, s5, s4
+; RV32I-NEXT: or s1, s7, s6
+; RV32I-NEXT: or s2, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: or s2, s6, s2
-; RV32I-NEXT: addi s6, sp, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: mv s7, sp
; RV32I-NEXT: slli t5, t5, 8
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: slli s11, s11, 16
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: slli s0, s0, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 3
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a3, t2, a3
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or a4, t5, a4
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or t2, s7, t6
-; RV32I-NEXT: or t3, ra, s8
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: or t5, a0, s11
+; RV32I-NEXT: or t3, t5, t3
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, s4, s3
+; RV32I-NEXT: or s0, a0, s5
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: andi a1, a1, 24
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: or t0, s3, t1
-; RV32I-NEXT: or t1, s5, s4
-; RV32I-NEXT: or a5, a5, s2
-; RV32I-NEXT: or a3, a7, a3
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or a7, t3, t2
-; RV32I-NEXT: or t2, t5, t4
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, s1, t4
+; RV32I-NEXT: or t0, s6, s2
+; RV32I-NEXT: or t1, t5, t3
+; RV32I-NEXT: or t2, s0, t6
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: sw a0, 48(sp)
-; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: add s6, s6, a1
-; RV32I-NEXT: sw a3, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw a7, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw t0, 12(sp)
-; RV32I-NEXT: sw t1, 16(sp)
-; RV32I-NEXT: sw a5, 20(sp)
-; RV32I-NEXT: lw a6, 16(s6)
-; RV32I-NEXT: lw a5, 20(s6)
-; RV32I-NEXT: lw a7, 24(s6)
-; RV32I-NEXT: lw a1, 0(s6)
-; RV32I-NEXT: lw a0, 4(s6)
-; RV32I-NEXT: lw a4, 8(s6)
-; RV32I-NEXT: lw a3, 12(s6)
-; RV32I-NEXT: lw t0, 28(s6)
+; RV32I-NEXT: add s7, s7, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: lw a6, 16(s7)
+; RV32I-NEXT: lw a5, 20(s7)
+; RV32I-NEXT: lw a7, 24(s7)
+; RV32I-NEXT: lw a1, 0(s7)
+; RV32I-NEXT: lw a0, 4(s7)
+; RV32I-NEXT: lw a4, 8(s7)
+; RV32I-NEXT: lw a3, 12(s7)
+; RV32I-NEXT: lw t0, 28(s7)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -5855,21 +5832,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
-; RV32I-NEXT: srli ra, a1, 24
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -5881,36 +5858,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: sb ra, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index b2c130c2d7c10a..b8952d2cb2b29e 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -1530,25 +1530,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a6, 2(a0)
-; RV32I-NEXT: lbu a7, 3(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -1557,107 +1556,105 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t5, 10(a0)
; RV32I-NEXT: lbu t6, 11(a0)
; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s2, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: lbu a3, 23(a0)
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
; RV32I-NEXT: or a7, t4, t3
; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s3, 25(a0)
-; RV32I-NEXT: lbu t4, 26(a0)
-; RV32I-NEXT: lbu t5, 27(a0)
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or t3, s7, s6
-; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
; RV32I-NEXT: lbu s4, 29(a0)
; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or s2, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: or s1, s3, s1
-; RV32I-NEXT: addi s3, sp, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or t4, t5, t4
-; RV32I-NEXT: or t5, s4, t6
-; RV32I-NEXT: or t6, s6, s5
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s8
-; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s4
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a7, t2, t1
-; RV32I-NEXT: or t0, a0, t3
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, t4, s1
-; RV32I-NEXT: or t3, t6, t5
-; RV32I-NEXT: or a0, a1, a3
-; RV32I-NEXT: sw t0, 24(sp)
-; RV32I-NEXT: sw t1, 28(sp)
-; RV32I-NEXT: sw t2, 32(sp)
-; RV32I-NEXT: sw t3, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, a0, t5
+; RV32I-NEXT: or t1, s0, t6
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a0, a1, s1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: add a4, s3, a4
+; RV32I-NEXT: add a4, s4, a4
; RV32I-NEXT: lw a3, 0(a4)
; RV32I-NEXT: lw a5, 4(a4)
; RV32I-NEXT: lw a6, 8(a4)
@@ -1717,13 +1714,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
-; RV32I-NEXT: srli s8, a1, 24
-; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
+; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli a7, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb t5, 17(a2)
; RV32I-NEXT: sb t4, 18(a2)
@@ -1744,27 +1741,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 14(a2)
; RV32I-NEXT: sb s5, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: sb s9, 2(a2)
-; RV32I-NEXT: sb s8, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t2, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2006,25 +2002,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a6, 2(a0)
-; RV32I-NEXT: lbu a7, 3(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -2033,107 +2028,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t5, 10(a0)
; RV32I-NEXT: lbu t6, 11(a0)
; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s2, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: lbu a3, 23(a0)
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
; RV32I-NEXT: or a7, t4, t3
; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s3, 25(a0)
-; RV32I-NEXT: lbu t4, 26(a0)
-; RV32I-NEXT: lbu t5, 27(a0)
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or t3, s7, s6
-; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
; RV32I-NEXT: lbu s4, 29(a0)
; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or s2, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: or s1, s3, s1
-; RV32I-NEXT: addi s3, sp, 40
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: addi s4, sp, 32
; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or t4, t5, t4
-; RV32I-NEXT: or t5, s4, t6
-; RV32I-NEXT: or t6, s6, s5
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s8
-; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s4
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a7, t2, t1
-; RV32I-NEXT: or t0, a0, t3
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, t4, s1
-; RV32I-NEXT: or t3, t6, t5
-; RV32I-NEXT: or a0, a1, a3
-; RV32I-NEXT: sw t0, 56(sp)
-; RV32I-NEXT: sw t1, 60(sp)
-; RV32I-NEXT: sw t2, 64(sp)
-; RV32I-NEXT: sw t3, 68(sp)
-; RV32I-NEXT: sw a4, 40(sp)
-; RV32I-NEXT: sw a5, 44(sp)
-; RV32I-NEXT: sw a6, 48(sp)
-; RV32I-NEXT: sw a7, 52(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, a0, t5
+; RV32I-NEXT: or t1, s0, t6
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a0, a1, s1
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t2, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a6, 44(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: sub a3, s3, a4
+; RV32I-NEXT: sub a3, s4, a4
; RV32I-NEXT: lw a4, 0(a3)
; RV32I-NEXT: lw a5, 4(a3)
; RV32I-NEXT: lw a6, 8(a3)
@@ -2193,13 +2186,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
-; RV32I-NEXT: srli s8, a1, 24
-; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
+; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli a7, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 28(a2)
; RV32I-NEXT: sb t5, 29(a2)
; RV32I-NEXT: sb t4, 30(a2)
@@ -2220,27 +2213,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 10(a2)
; RV32I-NEXT: sb s5, 11(a2)
; RV32I-NEXT: sb a1, 12(a2)
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: sb s9, 14(a2)
-; RV32I-NEXT: sb s8, 15(a2)
+; RV32I-NEXT: sb t0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb a7, 15(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2483,25 +2475,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a6, 2(a0)
-; RV32I-NEXT: lbu a7, 3(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -2518,100 +2509,98 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s6, 18(a0)
; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: or a4, a7, a6
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
; RV32I-NEXT: lbu s8, 20(a0)
; RV32I-NEXT: lbu s9, 21(a0)
; RV32I-NEXT: lbu s10, 22(a0)
; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu a3, 25(a0)
-; RV32I-NEXT: lbu t4, 26(a0)
-; RV32I-NEXT: lbu t5, 27(a0)
; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: slli s2, s2, 16
; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
; RV32I-NEXT: or t1, s1, s0
; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: lbu t6, 28(a0)
-; RV32I-NEXT: lbu s0, 29(a0)
-; RV32I-NEXT: lbu s1, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
; RV32I-NEXT: slli s6, s6, 16
; RV32I-NEXT: slli s7, s7, 24
; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: or s2, s7, s6
-; RV32I-NEXT: or s3, s9, s8
-; RV32I-NEXT: or s4, s11, s10
-; RV32I-NEXT: lbu s5, 0(a1)
-; RV32I-NEXT: lbu s6, 1(a1)
-; RV32I-NEXT: lbu s7, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, ra
-; RV32I-NEXT: addi s8, sp, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s0, s0, 8
; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
+; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or t4, t5, t4
-; RV32I-NEXT: or t5, s0, t6
-; RV32I-NEXT: or s1, a0, s1
-; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or s5, a0, s5
+; RV32I-NEXT: or s1, s2, s1
; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: srai s0, a0, 31
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, a0
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a7, t2, t1
-; RV32I-NEXT: or t0, s2, t3
-; RV32I-NEXT: or t1, s4, s3
-; RV32I-NEXT: or a3, t4, a3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: or a0, a1, t6
-; RV32I-NEXT: sw s0, 56(sp)
-; RV32I-NEXT: sw s0, 60(sp)
-; RV32I-NEXT: sw s0, 64(sp)
-; RV32I-NEXT: sw s0, 68(sp)
-; RV32I-NEXT: sw s0, 40(sp)
-; RV32I-NEXT: sw s0, 44(sp)
-; RV32I-NEXT: sw s0, 48(sp)
-; RV32I-NEXT: sw s0, 52(sp)
-; RV32I-NEXT: sw t0, 24(sp)
-; RV32I-NEXT: sw t1, 28(sp)
-; RV32I-NEXT: sw a3, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: srai s2, a0, 31
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, s6, t5
+; RV32I-NEXT: or t1, s0, t6
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a0, a1, s1
+; RV32I-NEXT: sw s2, 48(sp)
+; RV32I-NEXT: sw s2, 52(sp)
+; RV32I-NEXT: sw s2, 56(sp)
+; RV32I-NEXT: sw s2, 60(sp)
+; RV32I-NEXT: sw s2, 32(sp)
+; RV32I-NEXT: sw s2, 36(sp)
+; RV32I-NEXT: sw s2, 40(sp)
+; RV32I-NEXT: sw s2, 44(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: add a4, s8, a4
+; RV32I-NEXT: add a4, s4, a4
; RV32I-NEXT: lw a3, 0(a4)
; RV32I-NEXT: lw a5, 4(a4)
; RV32I-NEXT: lw a6, 8(a4)
@@ -2671,13 +2660,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
-; RV32I-NEXT: srli s8, a1, 24
-; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
+; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli a7, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb t5, 17(a2)
; RV32I-NEXT: sb t4, 18(a2)
@@ -2698,27 +2687,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 14(a2)
; RV32I-NEXT: sb s5, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: sb s9, 2(a2)
-; RV32I-NEXT: sb s8, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t2, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
From e96f7f7898790da1fe9cdc5cd3be7e3ae8eb8705 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Tue, 3 Dec 2024 21:44:29 +0800
Subject: [PATCH 2/3] Test commit: add a parameter to keep reserved registers
---
.../include/llvm/CodeGen/TargetRegisterInfo.h | 4 +-
llvm/lib/CodeGen/RegisterClassInfo.cpp | 3 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 +-
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 8 +-
llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 4 +-
llvm/test/CodeGen/RISCV/pr69586.ll | 844 +++--
.../RISCV/rvv/fixed-vectors-masked-scatter.ll | 78 +-
.../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 2104 +++++------
.../RISCV/rvv/intrinsic-vector-match.ll | 472 +--
...lar-shift-by-byte-multiple-legalization.ll | 3238 +++++++++--------
.../RISCV/wide-scalar-shift-legalization.ll | 646 ++--
llvm/unittests/CodeGen/MFCommon.inc | 4 +-
llvm/utils/TableGen/RegisterInfoEmitter.cpp | 5 +-
14 files changed, 3813 insertions(+), 3604 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 292fa3c94969be..eaed26e33c4eb5 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -914,8 +914,10 @@ class TargetRegisterInfo : public MCRegisterInfo {
/// Get the register unit pressure limit for this dimension.
/// This limit must be adjusted dynamically for reserved registers.
+ /// If RemoveReserved is true, the target should exclude reserved registers from the returned limit.
virtual unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const = 0;
+ unsigned Idx,
+ bool RemoveReserved = true) const = 0;
/// Get the dimensions of register pressure impacted by this register class.
/// Returns a -1 terminated array of pressure set IDs.
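For targets that want to adopt the new hook, a minimal sketch of an override is shown below. `MyTarget`, `GPRPressureSet`, `GPR_FIRST`/`GPR_LAST`, and the base limit of 32 are illustrative placeholders, not anything defined by this patch; the `getReservedRegs()`/`BitVector` usage mirrors the RISC-V implementation further down.
```cpp
// Sketch only: a hypothetical target honoring the RemoveReserved flag.
// MyTarget, GPRPressureSet, GPR_FIRST/GPR_LAST and the base limit of 32
// are placeholders for this illustration.
unsigned
MyTargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                             unsigned Idx,
                                             bool RemoveReserved) const {
  if (Idx == MyTarget::RegisterPressureSets::GPRPressureSet) {
    unsigned Limit = 32; // TableGen-generated raw limit for this set.
    if (!RemoveReserved)
      return Limit;
    // Drop registers that are never allocatable in this function.
    BitVector Reserved = getReservedRegs(MF);
    for (MCPhysReg Reg = MyTarget::GPR_FIRST; Reg <= MyTarget::GPR_LAST; ++Reg)
      if (Reserved.test(Reg))
        --Limit;
    return Limit;
  }
  return MyTargetGenRegisterInfo::getRegPressureSetLimit(MF, Idx, RemoveReserved);
}
```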
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 9312bc03bc522a..0a33915ed1e40b 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -222,7 +222,8 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
assert(RC && "Failed to find register class");
compute(RC);
unsigned NAllocatableRegs = getNumAllocatableRegs(RC);
- unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx);
+ unsigned RegPressureSetLimit =
+ TRI->getRegPressureSetLimit(*MF, Idx, /*RemoveReserved=*/false);
// If all the regs are reserved, return raw RegPressureSetLimit.
// One example is VRSAVERC in PowerPC.
// Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit
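One point worth noting in the hunk above: this caller keeps its own reserved-register accounting, so it now requests the raw limit with `RemoveReserved=false` to avoid subtracting the reserved set twice. A simplified, hypothetical illustration of that caller-side pattern follows; only the `getRegPressureSetLimit` call is from this patch, the helper name and parameters are made up.
```cpp
// Illustrative only: a caller that subtracts reserved registers itself
// asks the target for the raw, unadjusted limit.
unsigned clampedPSetLimit(const llvm::TargetRegisterInfo &TRI,
                          const llvm::MachineFunction &MF, unsigned Idx,
                          unsigned NumReserved, unsigned RegWeight) {
  unsigned Raw = TRI.getRegPressureSetLimit(MF, Idx, /*RemoveReserved=*/false);
  // Subtracting the reserved set again inside the hook would double-count it.
  return Raw - NumReserved * RegWeight;
}
```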
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 049f4af4dd2f93..9883454ed78298 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3640,7 +3640,8 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const {
+ unsigned Idx,
+ bool RemoveReserved) const {
if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
Idx == AMDGPU::RegisterPressureSets::AGPR_32)
return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 8e481e3ac23043..b55f5f2c418b09 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -331,8 +331,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const override;
+ unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
+ bool RemoveReserved = true) const override;
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index a73bd1621a739d..d5a769b6c78c7c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -936,8 +936,12 @@ bool RISCVRegisterInfo::getRegAllocationHints(
}
unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const {
+ unsigned Idx,
+ bool RemoveReserved) const {
if (Idx == RISCV::RegisterPressureSets::GPRAll) {
+ if (!RemoveReserved)
+ return 32;
+
unsigned Reserved = 0;
BitVector ReservedRegs = getReservedRegs(MF);
for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
@@ -946,5 +950,5 @@ unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
return 32 - Reserved;
}
- return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx);
+ return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx, RemoveReserved);
}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index ca4934de2f52d2..58f97394ec559b 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,8 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
static bool isRVVRegClass(const TargetRegisterClass *RC) {
return RISCVRI::isVRegClass(RC->TSFlags);
}
- unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const override;
+ unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
+ bool RemoveReserved = true) const override;
};
} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 21e64ada7061aa..8e6a7add781c93 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -39,384 +39,388 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a2, a2, 1
; NOREMAT-NEXT: sub sp, sp, a2
; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
-; NOREMAT-NEXT: mv a7, a0
-; NOREMAT-NEXT: li a0, 32
-; NOREMAT-NEXT: addi a5, a7, 512
-; NOREMAT-NEXT: addi a4, a7, 1024
-; NOREMAT-NEXT: addi a6, a7, 1536
-; NOREMAT-NEXT: li t1, 1
+; NOREMAT-NEXT: li a7, 32
+; NOREMAT-NEXT: addi a6, a0, 512
+; NOREMAT-NEXT: addi a4, a0, 1024
+; NOREMAT-NEXT: addi a5, a0, 1536
+; NOREMAT-NEXT: li t0, 1
; NOREMAT-NEXT: li a3, 5
-; NOREMAT-NEXT: li t0, 3
+; NOREMAT-NEXT: li t1, 3
; NOREMAT-NEXT: li a2, 7
; NOREMAT-NEXT: lui t2, 1
-; NOREMAT-NEXT: li s5, 9
-; NOREMAT-NEXT: li s8, 11
-; NOREMAT-NEXT: lui s1, 2
-; NOREMAT-NEXT: lui t5, 3
-; NOREMAT-NEXT: lui s11, 4
-; NOREMAT-NEXT: lui ra, 5
-; NOREMAT-NEXT: lui t3, 6
-; NOREMAT-NEXT: lui s0, 7
-; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT: slli t4, t1, 11
-; NOREMAT-NEXT: slli t6, a3, 9
-; NOREMAT-NEXT: slli s2, t0, 10
-; NOREMAT-NEXT: slli s4, a2, 9
-; NOREMAT-NEXT: add a0, a7, t2
-; NOREMAT-NEXT: vle32.v v8, (a5)
-; NOREMAT-NEXT: slli s5, s5, 9
+; NOREMAT-NEXT: li s4, 9
+; NOREMAT-NEXT: li s6, 11
+; NOREMAT-NEXT: li s9, 13
+; NOREMAT-NEXT: lui s7, 2
+; NOREMAT-NEXT: lui s1, 3
+; NOREMAT-NEXT: lui ra, 4
+; NOREMAT-NEXT: lui t3, 5
+; NOREMAT-NEXT: lui s0, 6
+; NOREMAT-NEXT: lui s3, 7
+; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma
+; NOREMAT-NEXT: slli t0, t0, 11
+; NOREMAT-NEXT: sd t0, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli t5, a3, 9
+; NOREMAT-NEXT: slli t6, t1, 10
+; NOREMAT-NEXT: slli s2, a2, 9
+; NOREMAT-NEXT: add a7, a0, t2
+; NOREMAT-NEXT: lui s11, 1
+; NOREMAT-NEXT: slli s4, s4, 9
+; NOREMAT-NEXT: slli s5, a3, 10
+; NOREMAT-NEXT: vle32.v v8, (a6)
+; NOREMAT-NEXT: slli s6, s6, 9
+; NOREMAT-NEXT: slli s8, t1, 11
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: slli s6, a3, 10
-; NOREMAT-NEXT: vle32.v v0, (a6)
-; NOREMAT-NEXT: vle32.v v12, (a6)
-; NOREMAT-NEXT: slli s8, s8, 9
-; NOREMAT-NEXT: slli s9, t0, 11
-; NOREMAT-NEXT: vle32.v v4, (a0)
-; NOREMAT-NEXT: vle32.v v20, (a0)
-; NOREMAT-NEXT: add a4, a7, s1
+; NOREMAT-NEXT: slli s9, s9, 9
+; NOREMAT-NEXT: vle32.v v0, (a5)
+; NOREMAT-NEXT: vle32.v v12, (a5)
+; NOREMAT-NEXT: slli s10, a2, 10
+; NOREMAT-NEXT: vle32.v v4, (a7)
+; NOREMAT-NEXT: vle32.v v20, (a7)
+; NOREMAT-NEXT: add a4, a0, s7
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: add a4, a7, t5
+; NOREMAT-NEXT: add a4, a0, s1
; NOREMAT-NEXT: vle32.v v28, (a4)
; NOREMAT-NEXT: vle32.v v26, (a4)
-; NOREMAT-NEXT: add a4, a7, s11
+; NOREMAT-NEXT: add a4, a0, ra
; NOREMAT-NEXT: vle32.v v24, (a4)
; NOREMAT-NEXT: vle32.v v22, (a4)
-; NOREMAT-NEXT: add a4, a7, ra
-; NOREMAT-NEXT: vle32.v v14, (a7)
+; NOREMAT-NEXT: add a4, a0, t3
+; NOREMAT-NEXT: vle32.v v14, (a0)
; NOREMAT-NEXT: vle32.v v18, (a4)
; NOREMAT-NEXT: vle32.v v16, (a4)
-; NOREMAT-NEXT: add a4, a7, t3
+; NOREMAT-NEXT: add a4, a0, s0
; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8
; NOREMAT-NEXT: vle32.v v14, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: addi a0, sp, 640
-; NOREMAT-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
-; NOREMAT-NEXT: add a4, a7, t4
+; NOREMAT-NEXT: addi a4, sp, 640
+; NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill
+; NOREMAT-NEXT: add a4, a0, t0
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, t6
+; NOREMAT-NEXT: add a4, a0, t5
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, s2
+; NOREMAT-NEXT: add a4, a0, t6
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, s4
+; NOREMAT-NEXT: add a4, a0, s2
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s0
+; NOREMAT-NEXT: add a4, a0, s3
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, s5
+; NOREMAT-NEXT: add a4, a0, s4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s6
+; NOREMAT-NEXT: add a4, a0, s5
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, s8
+; NOREMAT-NEXT: add a4, a0, s6
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s9
+; NOREMAT-NEXT: add a4, a0, s8
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: li t5, 13
-; NOREMAT-NEXT: slli a4, t5, 9
-; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, s9
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: slli a4, a2, 10
-; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, s10
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: li a6, 15
-; NOREMAT-NEXT: slli a4, a6, 9
-; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: li t2, 15
+; NOREMAT-NEXT: slli a4, t2, 9
+; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v2, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT: lui t1, 8
-; NOREMAT-NEXT: add a5, a7, t1
+; NOREMAT-NEXT: lui t4, 8
+; NOREMAT-NEXT: add a5, a0, t4
; NOREMAT-NEXT: vle32.v v20, (a5)
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2
; NOREMAT-NEXT: li a4, 17
; NOREMAT-NEXT: slli a4, a4, 9
-; NOREMAT-NEXT: li t2, 17
-; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: li s1, 17
+; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6
; NOREMAT-NEXT: li a5, 9
; NOREMAT-NEXT: slli a4, a5, 10
-; NOREMAT-NEXT: sd a4, 592(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: li a4, 19
; NOREMAT-NEXT: slli a4, a4, 9
-; NOREMAT-NEXT: li s1, 19
-; NOREMAT-NEXT: sd a4, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: li t1, 19
+; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
; NOREMAT-NEXT: slli a3, a3, 11
-; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li s7, 21
; NOREMAT-NEXT: slli a3, s7, 9
-; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: li a4, 11
-; NOREMAT-NEXT: slli a3, a4, 10
-; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: li a6, 11
+; NOREMAT-NEXT: slli a3, a6, 10
+; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
; NOREMAT-NEXT: li s3, 23
-; NOREMAT-NEXT: slli s10, s3, 9
-; NOREMAT-NEXT: add a3, a7, s10
+; NOREMAT-NEXT: slli a3, s3, 9
+; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
; NOREMAT-NEXT: li s0, 25
; NOREMAT-NEXT: slli a3, s0, 9
-; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT: slli a3, t5, 10
-; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: li a7, 13
+; NOREMAT-NEXT: slli a3, a7, 10
+; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28
; NOREMAT-NEXT: li t3, 27
; NOREMAT-NEXT: slli a3, t3, 9
-; NOREMAT-NEXT: sd a3, 536(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v28, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: slli a2, a2, 11
-; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li t0, 29
; NOREMAT-NEXT: slli a2, t0, 9
-; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT: slli a2, a6, 10
-; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: slli a2, t2, 10
+; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: li t2, 15
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
; NOREMAT-NEXT: li a3, 31
-; NOREMAT-NEXT: slli a0, a3, 9
-; NOREMAT-NEXT: sd a0, 504(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a7, a0
-; NOREMAT-NEXT: vle32.v v12, (a0)
-; NOREMAT-NEXT: vle32.v v4, (a0)
+; NOREMAT-NEXT: slli a2, a3, 9
+; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: vle32.v v12, (a2)
+; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT: addiw a2, s11, 512
+; NOREMAT-NEXT: addiw a2, ra, 512
; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28
-; NOREMAT-NEXT: slli a2, t2, 10
+; NOREMAT-NEXT: slli a2, s1, 10
; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: addiw a2, s11, 1536
+; NOREMAT-NEXT: addiw a2, ra, 1536
; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, a5, 11
; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8
-; NOREMAT-NEXT: addiw a2, ra, -1536
+; NOREMAT-NEXT: lui a4, 5
+; NOREMAT-NEXT: addiw a2, a4, -1536
; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28
-; NOREMAT-NEXT: slli a2, s1, 10
+; NOREMAT-NEXT: slli a2, t1, 10
; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: li t1, 19
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT: addiw a2, ra, -512
+; NOREMAT-NEXT: addiw a2, a4, -512
; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24
-; NOREMAT-NEXT: addiw a2, ra, 512
+; NOREMAT-NEXT: addiw a2, a4, 512
; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, s7, 10
; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26
-; NOREMAT-NEXT: addiw a2, ra, 1536
+; NOREMAT-NEXT: addiw a2, a4, 1536
; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
-; NOREMAT-NEXT: slli a2, a4, 11
+; NOREMAT-NEXT: slli a2, a6, 11
; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18
-; NOREMAT-NEXT: lui a4, 6
-; NOREMAT-NEXT: addiw a2, a4, -1536
+; NOREMAT-NEXT: lui a5, 6
+; NOREMAT-NEXT: addiw a2, a5, -1536
; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: slli a2, s3, 10
; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT: addiw a2, a4, -512
+; NOREMAT-NEXT: addiw a2, a5, -512
; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT: addiw a2, a4, 512
+; NOREMAT-NEXT: addiw a2, a5, 512
; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, s0, 10
; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18
-; NOREMAT-NEXT: addiw a2, a4, 1536
+; NOREMAT-NEXT: addiw a2, a5, 1536
; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
-; NOREMAT-NEXT: slli a2, t5, 11
+; NOREMAT-NEXT: slli a2, a7, 11
; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8
-; NOREMAT-NEXT: lui a5, 7
-; NOREMAT-NEXT: addiw a2, a5, -1536
+; NOREMAT-NEXT: lui a7, 7
+; NOREMAT-NEXT: addiw a2, a7, -1536
; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, t3, 10
; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addi a0, sp, 640
-; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload
+; NOREMAT-NEXT: addi a2, sp, 640
+; NOREMAT-NEXT: vl2r.v v12, (a2) # Unknown-size Folded Reload
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22
-; NOREMAT-NEXT: addiw a2, a5, -512
+; NOREMAT-NEXT: addiw a2, a7, -512
; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26
-; NOREMAT-NEXT: addiw a2, a5, 512
+; NOREMAT-NEXT: addiw a2, a7, 512
; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, t0, 10
; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16
-; NOREMAT-NEXT: addiw a2, a5, 1536
+; NOREMAT-NEXT: addiw a2, a7, 1536
; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
-; NOREMAT-NEXT: slli a2, a6, 11
+; NOREMAT-NEXT: slli a2, t2, 11
; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14
-; NOREMAT-NEXT: addiw a2, t1, -1536
+; NOREMAT-NEXT: addiw a2, t4, -1536
; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, a3, 10
; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addiw a0, t1, -512
-; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a7, a0
+; NOREMAT-NEXT: addiw a2, t4, -512
+; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a0, a2
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0
; NOREMAT-NEXT: vle32.v v12, (a0)
; NOREMAT-NEXT: vle32.v v0, (a0)
@@ -431,33 +435,32 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; NOREMAT-NEXT: addi a0, a1, 1024
; NOREMAT-NEXT: vse32.v v8, (a0)
-; NOREMAT-NEXT: lui a0, 1
-; NOREMAT-NEXT: add a0, a1, a0
-; NOREMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add s11, a1, s11
+; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 2
; NOREMAT-NEXT: add a0, a1, a0
; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 3
; NOREMAT-NEXT: add a0, a1, a0
; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add s11, a1, s11
-; NOREMAT-NEXT: sd s11, 248(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add ra, a1, ra
-; NOREMAT-NEXT: sd ra, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a4, a1, a4
-; NOREMAT-NEXT: sd a4, 232(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a4, 240(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a5, a1, a5
-; NOREMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a1, t1
+; NOREMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a7, a1, a7
+; NOREMAT-NEXT: sd a7, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a1, t4
; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t1, 512
+; NOREMAT-NEXT: addiw a0, t4, 512
; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t1, 1024
+; NOREMAT-NEXT: addiw a0, t4, 1024
; NOREMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t1, 1536
+; NOREMAT-NEXT: addiw a0, t4, 1536
; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli t2, t2, 11
-; NOREMAT-NEXT: sd t2, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli s1, s1, 11
+; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 9
; NOREMAT-NEXT: addiw a2, a0, -1536
; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
@@ -470,7 +473,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: addiw s11, a0, 512
; NOREMAT-NEXT: addiw s7, a0, 1024
; NOREMAT-NEXT: addiw s3, a0, 1536
-; NOREMAT-NEXT: slli s1, s1, 11
+; NOREMAT-NEXT: slli s1, t1, 11
; NOREMAT-NEXT: lui a0, 10
; NOREMAT-NEXT: addiw t2, a0, -1536
; NOREMAT-NEXT: addiw a7, a0, -1024
@@ -478,52 +481,52 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: add a2, a1, a0
; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: addiw a0, a0, 512
-; NOREMAT-NEXT: add a2, a1, t4
-; NOREMAT-NEXT: add a3, a1, t6
-; NOREMAT-NEXT: add a5, a1, s2
-; NOREMAT-NEXT: add a6, a1, s4
-; NOREMAT-NEXT: add t0, a1, s5
-; NOREMAT-NEXT: add t1, a1, s6
-; NOREMAT-NEXT: add t3, a1, s8
-; NOREMAT-NEXT: add t4, a1, s9
-; NOREMAT-NEXT: ld t5, 624(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT: add t5, a1, t5
-; NOREMAT-NEXT: ld t6, 616(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT: add t6, a1, t6
-; NOREMAT-NEXT: ld s0, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld a2, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add a2, a1, a2
+; NOREMAT-NEXT: add a3, a1, t5
+; NOREMAT-NEXT: add a5, a1, t6
+; NOREMAT-NEXT: add a6, a1, s2
+; NOREMAT-NEXT: add t0, a1, s4
+; NOREMAT-NEXT: add t1, a1, s5
+; NOREMAT-NEXT: add t3, a1, s6
+; NOREMAT-NEXT: add t4, a1, s8
+; NOREMAT-NEXT: add t5, a1, s9
+; NOREMAT-NEXT: add t6, a1, s10
+; NOREMAT-NEXT: ld s0, 624(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s0, a1, s0
-; NOREMAT-NEXT: ld s2, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s2, 616(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s2, a1, s2
-; NOREMAT-NEXT: ld s4, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s4, 608(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s4, a1, s4
-; NOREMAT-NEXT: ld s5, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s5, 600(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s5, a1, s5
-; NOREMAT-NEXT: ld s6, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s6, 592(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s6, a1, s6
-; NOREMAT-NEXT: ld s8, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s8, 584(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s8, a1, s8
-; NOREMAT-NEXT: ld s9, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s9, 576(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s9, a1, s9
+; NOREMAT-NEXT: ld s10, 568(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s10, a1, s10
-; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 560(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload
@@ -917,9 +920,10 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: .cfi_offset s10, -96
; REMAT-NEXT: .cfi_offset s11, -104
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 3
+; REMAT-NEXT: li a3, 14
+; REMAT-NEXT: mul a2, a2, a3
; REMAT-NEXT: sub sp, sp, a2
-; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb
+; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb
; REMAT-NEXT: li a4, 32
; REMAT-NEXT: addi a5, a0, 512
; REMAT-NEXT: addi a3, a0, 1024
@@ -956,13 +960,20 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli s6, s6, 9
; REMAT-NEXT: li s7, 5
; REMAT-NEXT: slli s7, s7, 11
+; REMAT-NEXT: li s8, 21
+; REMAT-NEXT: slli s8, s8, 9
+; REMAT-NEXT: li s9, 11
+; REMAT-NEXT: slli s9, s9, 10
+; REMAT-NEXT: li s10, 23
+; REMAT-NEXT: slli s10, s10, 9
+; REMAT-NEXT: lui s11, 3
; REMAT-NEXT: vsetvli zero, a4, e32, m2, ta, ma
; REMAT-NEXT: vle32.v v8, (a5)
-; REMAT-NEXT: li a4, 21
+; REMAT-NEXT: li a4, 25
; REMAT-NEXT: slli a4, a4, 9
; REMAT-NEXT: vle32.v v10, (a3)
; REMAT-NEXT: vle32.v v12, (a3)
-; REMAT-NEXT: li a3, 11
+; REMAT-NEXT: li a3, 13
; REMAT-NEXT: slli a3, a3, 10
; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: vle32.v v16, (a2)
@@ -979,7 +990,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a5, 6
+; REMAT-NEXT: li a5, 12
; REMAT-NEXT: mul a2, a2, a5
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
@@ -989,7 +1000,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 2
+; REMAT-NEXT: li a5, 10
+; REMAT-NEXT: mul a2, a2, a5
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1003,11 +1015,16 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: add a2, a0, t5
-; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18
-; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: csrr a2, vlenb
+; REMAT-NEXT: slli a2, a2, 3
+; REMAT-NEXT: add a2, sp, a2
+; REMAT-NEXT: addi a2, a2, 432
+; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
; REMAT-NEXT: add a2, a0, t6
-; REMAT-NEXT: vle32.v v16, (a2)
+; REMAT-NEXT: vle32.v v18, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: add a2, a0, s0
@@ -1017,340 +1034,383 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: add a2, a0, s1
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30
-; REMAT-NEXT: vle32.v v14, (a2)
+; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: add a2, a0, s2
-; REMAT-NEXT: vle32.v v12, (a2)
+; REMAT-NEXT: vle32.v v8, (a2)
; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: li a6, 6
+; REMAT-NEXT: li a6, 12
; REMAT-NEXT: mul a5, a5, a6
; REMAT-NEXT: add a5, sp, a5
; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v28, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v28, v2
+; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: add a2, a0, s3
-; REMAT-NEXT: vle32.v v28, (a2)
+; REMAT-NEXT: vle32.v v12, (a2)
; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: slli a5, a5, 2
+; REMAT-NEXT: li a6, 10
+; REMAT-NEXT: mul a5, a5, a6
; REMAT-NEXT: add a5, sp, a5
; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v30, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v30, v4
-; REMAT-NEXT: vle32.v v4, (a2)
-; REMAT-NEXT: add a2, a0, s4
+; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v30, (a2)
+; REMAT-NEXT: add a2, a0, s4
+; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: add a2, a0, s5
; REMAT-NEXT: vle32.v v6, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8
-; REMAT-NEXT: vle32.v v0, (a2)
+; REMAT-NEXT: add a2, a0, s5
+; REMAT-NEXT: vle32.v v10, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14
+; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: add a2, a0, s6
-; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16
-; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: vle32.v v14, (a2)
+; REMAT-NEXT: csrr a5, vlenb
+; REMAT-NEXT: slli a5, a5, 3
+; REMAT-NEXT: add a5, sp, a5
+; REMAT-NEXT: addi a5, a5, 432
+; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: add a2, a0, s7
-; REMAT-NEXT: vle32.v v16, (a2)
+; REMAT-NEXT: vle32.v v18, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22
-; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: add a2, a0, a4
+; REMAT-NEXT: vle32.v v20, (a2)
+; REMAT-NEXT: csrr a2, vlenb
+; REMAT-NEXT: slli a2, a2, 3
+; REMAT-NEXT: add a2, sp, a2
+; REMAT-NEXT: addi a2, a2, 432
+; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: add a2, a0, s8
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26
; REMAT-NEXT: vle32.v v24, (a2)
+; REMAT-NEXT: add a2, a0, s9
+; REMAT-NEXT: vle32.v v22, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8
+; REMAT-NEXT: vle32.v v26, (a2)
+; REMAT-NEXT: add a2, a0, s10
+; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12
+; REMAT-NEXT: vle32.v v12, (a2)
+; REMAT-NEXT: add a2, a0, s11
+; REMAT-NEXT: vle32.v v2, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16
+; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: addi a2, sp, 432
-; REMAT-NEXT: vs2r.v v24, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: add a2, a0, a4
+; REMAT-NEXT: vle32.v v16, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10
+; REMAT-NEXT: vle32.v v10, (a2)
+; REMAT-NEXT: csrr a2, vlenb
+; REMAT-NEXT: slli a2, a2, 1
+; REMAT-NEXT: add a2, sp, a2
+; REMAT-NEXT: addi a2, a2, 432
+; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
; REMAT-NEXT: add a2, a0, a3
-; REMAT-NEXT: vle32.v v24, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v14, v12
-; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: li a5, 23
+; REMAT-NEXT: vle32.v v10, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14
+; REMAT-NEXT: vle32.v v14, (a2)
+; REMAT-NEXT: csrr a2, vlenb
+; REMAT-NEXT: li a3, 12
+; REMAT-NEXT: mul a2, a2, a3
+; REMAT-NEXT: add a2, sp, a2
+; REMAT-NEXT: addi a2, a2, 432
+; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: li a5, 27
; REMAT-NEXT: slli a5, a5, 9
; REMAT-NEXT: add a2, a0, a5
-; REMAT-NEXT: vle32.v v26, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v2, v28
; REMAT-NEXT: vle32.v v14, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT: vle32.v v18, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 6
+; REMAT-NEXT: li a3, 10
; REMAT-NEXT: mul a2, a2, a3
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: lui s8, 3
-; REMAT-NEXT: add a2, a0, s8
+; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: li ra, 7
+; REMAT-NEXT: slli ra, ra, 11
+; REMAT-NEXT: add a2, a0, ra
; REMAT-NEXT: vle32.v v28, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v4, v30
-; REMAT-NEXT: vle32.v v14, (a2)
+; REMAT-NEXT: csrr a3, vlenb
+; REMAT-NEXT: slli a3, a3, 3
+; REMAT-NEXT: add a3, sp, a3
+; REMAT-NEXT: addi a3, a3, 432
+; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v18, v20
+; REMAT-NEXT: vle32.v v18, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 2
+; REMAT-NEXT: slli a2, a2, 3
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li s9, 25
-; REMAT-NEXT: slli s9, s9, 9
-; REMAT-NEXT: add a2, a0, s9
+; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: li a2, 29
+; REMAT-NEXT: slli a2, a2, 9
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v30, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v10, v6
-; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li s10, 13
-; REMAT-NEXT: slli s10, s10, 10
-; REMAT-NEXT: add a2, a0, s10
+; REMAT-NEXT: sf.vc.vv 3, 0, v24, v22
+; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: csrr a2, vlenb
+; REMAT-NEXT: li a3, 6
+; REMAT-NEXT: mul a2, a2, a3
+; REMAT-NEXT: add a2, sp, a2
+; REMAT-NEXT: addi a2, a2, 432
+; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: li a2, 15
+; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8
+; REMAT-NEXT: sf.vc.vv 3, 0, v26, v8
; REMAT-NEXT: vle32.v v8, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 1
+; REMAT-NEXT: slli a2, a2, 2
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li s11, 27
-; REMAT-NEXT: slli s11, s11, 9
-; REMAT-NEXT: add a2, a0, s11
+; REMAT-NEXT: li a2, 31
+; REMAT-NEXT: slli a2, a2, 9
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v4, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16
+; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: li ra, 7
-; REMAT-NEXT: slli ra, ra, 11
-; REMAT-NEXT: add a2, a0, ra
+; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v2, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20
+; REMAT-NEXT: addi a3, sp, 432
+; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v16
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: li a2, 29
-; REMAT-NEXT: slli a2, a2, 9
+; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: addiw a2, a2, 512
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
-; REMAT-NEXT: addi a3, sp, 432
+; REMAT-NEXT: csrr a3, vlenb
+; REMAT-NEXT: slli a3, a3, 1
+; REMAT-NEXT: add a3, sp, a3
+; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v24
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li a2, 15
+; REMAT-NEXT: li a2, 17
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v12, v26
+; REMAT-NEXT: csrr a3, vlenb
+; REMAT-NEXT: li a4, 12
+; REMAT-NEXT: mul a3, a3, a4
+; REMAT-NEXT: add a3, sp, a3
+; REMAT-NEXT: addi a3, a3, 432
+; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: li a2, 31
-; REMAT-NEXT: slli a2, a2, 9
+; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: addiw a2, a2, 1536
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 6
+; REMAT-NEXT: li a4, 10
; REMAT-NEXT: mul a3, a3, a4
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: li a2, 9
+; REMAT-NEXT: slli a2, a2, 11
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 2
+; REMAT-NEXT: slli a3, a3, 3
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: addiw a2, a2, 512
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, -1536
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v30, (a2)
+; REMAT-NEXT: csrr a3, vlenb
+; REMAT-NEXT: li a4, 6
+; REMAT-NEXT: mul a3, a3, a4
+; REMAT-NEXT: add a3, sp, a3
+; REMAT-NEXT: addi a3, a3, 432
+; REMAT-NEXT: vl2r.v v14, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li a2, 17
+; REMAT-NEXT: li a2, 19
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 1
+; REMAT-NEXT: slli a3, a3, 2
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: addiw a2, a2, 1536
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, -512
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: li a2, 9
-; REMAT-NEXT: slli a2, a2, 11
+; REMAT-NEXT: lui a2, 5
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: addiw a2, a2, 512
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li a2, 19
+; REMAT-NEXT: li a2, 21
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, -512
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui s4, 5
+; REMAT-NEXT: addiw s4, s4, 1536
+; REMAT-NEXT: add a2, a0, s4
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: li a2, 11
+; REMAT-NEXT: slli a2, a2, 11
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, 512
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui s3, 6
+; REMAT-NEXT: addiw s3, s3, -1536
+; REMAT-NEXT: add a2, a0, s3
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li a2, 21
-; REMAT-NEXT: slli a2, a2, 10
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: li s2, 23
+; REMAT-NEXT: slli s2, s2, 10
+; REMAT-NEXT: add a2, a0, s2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, 1536
+; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: addiw a2, a2, -512
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: li a2, 11
-; REMAT-NEXT: slli a2, a2, 11
+; REMAT-NEXT: lui a2, 6
; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui s1, 6
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: lui a2, 6
-; REMAT-NEXT: addiw a2, a2, -1536
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui s0, 6
+; REMAT-NEXT: addiw s0, s0, 512
+; REMAT-NEXT: add a2, a0, s0
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li a2, 23
+; REMAT-NEXT: li a2, 25
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui a2, 6
-; REMAT-NEXT: addiw a2, a2, -512
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui t6, 6
+; REMAT-NEXT: addiw t6, t6, 1536
+; REMAT-NEXT: add a2, a0, t6
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a2, 6
-; REMAT-NEXT: add a2, a0, a2
-; REMAT-NEXT: lui s1, 6
+; REMAT-NEXT: li t5, 13
+; REMAT-NEXT: slli t5, t5, 11
+; REMAT-NEXT: add a2, a0, t5
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui s0, 6
-; REMAT-NEXT: addiw s0, s0, 512
-; REMAT-NEXT: add a2, a0, s0
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li a2, 25
-; REMAT-NEXT: slli a2, a2, 10
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: li t4, 27
+; REMAT-NEXT: slli t4, t4, 10
+; REMAT-NEXT: add a2, a0, t4
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui t6, 6
-; REMAT-NEXT: addiw t6, t6, 1536
-; REMAT-NEXT: add a2, a0, t6
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: li t5, 13
-; REMAT-NEXT: slli t5, t5, 11
-; REMAT-NEXT: add a2, a0, t5
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui t3, 7
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: addiw a2, a2, -1536
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui t2, 7
+; REMAT-NEXT: addiw t2, t2, 512
+; REMAT-NEXT: add a2, a0, t2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li t4, 27
-; REMAT-NEXT: slli t4, t4, 10
-; REMAT-NEXT: add a2, a0, t4
+; REMAT-NEXT: li t1, 29
+; REMAT-NEXT: slli t1, t1, 10
+; REMAT-NEXT: add a2, a0, t1
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: addiw a2, a2, -512
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui t0, 7
+; REMAT-NEXT: addiw t0, t0, 1536
+; REMAT-NEXT: add a2, a0, t0
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: add a2, a0, a2
-; REMAT-NEXT: lui t3, 7
+; REMAT-NEXT: li a7, 15
+; REMAT-NEXT: slli a7, a7, 11
+; REMAT-NEXT: add a2, a0, a7
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui t2, 7
-; REMAT-NEXT: addiw t2, t2, 512
-; REMAT-NEXT: add a2, a0, t2
+; REMAT-NEXT: lui a6, 8
+; REMAT-NEXT: addiw a6, a6, -1536
+; REMAT-NEXT: add a2, a0, a6
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li t1, 29
-; REMAT-NEXT: slli t1, t1, 10
-; REMAT-NEXT: add a2, a0, t1
+; REMAT-NEXT: li a4, 31
+; REMAT-NEXT: slli a4, a4, 10
+; REMAT-NEXT: add a2, a0, a4
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui t0, 7
-; REMAT-NEXT: addiw t0, t0, 1536
-; REMAT-NEXT: add a2, a0, t0
+; REMAT-NEXT: lui a3, 8
+; REMAT-NEXT: addiw a3, a3, -512
+; REMAT-NEXT: add a2, a0, a3
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: li a7, 15
-; REMAT-NEXT: slli a7, a7, 11
-; REMAT-NEXT: add a2, a0, a7
-; REMAT-NEXT: vle32.v v2, (a2)
+; REMAT-NEXT: lui a2, 8
+; REMAT-NEXT: add a0, a0, a2
+; REMAT-NEXT: vle32.v v2, (a0)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
-; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: lui a6, 8
-; REMAT-NEXT: addiw a6, a6, -1536
-; REMAT-NEXT: add a2, a0, a6
-; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
-; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li a4, 31
-; REMAT-NEXT: slli a4, a4, 10
-; REMAT-NEXT: add a2, a0, a4
-; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
-; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui a3, 8
-; REMAT-NEXT: addiw a3, a3, -512
-; REMAT-NEXT: add a2, a0, a3
-; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
-; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: lui a2, 8
-; REMAT-NEXT: add a0, a0, a2
-; REMAT-NEXT: vle32.v v28, (a0)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
-; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
-; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
-; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: addi a0, a1, 1024
; REMAT-NEXT: vse32.v v8, (a0)
@@ -1397,36 +1457,41 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 336(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s2, a1, s2
-; REMAT-NEXT: sd s2, 328(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s3, a1, s3
-; REMAT-NEXT: sd s3, 320(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s4, a1, s4
-; REMAT-NEXT: sd s4, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 15
+; REMAT-NEXT: slli a0, a0, 9
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT: lui a0, 2
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 17
+; REMAT-NEXT: slli a0, a0, 9
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s5, a1, s5
; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s6, a1, s6
; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s7, a1, s7
; REMAT-NEXT: sd s7, 288(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 21
+; REMAT-NEXT: add s8, a1, s8
+; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s9, a1, s9
+; REMAT-NEXT: sd s9, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s10, a1, s10
+; REMAT-NEXT: sd s10, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s11, a1, s11
+; REMAT-NEXT: sd s11, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 25
; REMAT-NEXT: slli a0, a0, 9
; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 11
+; REMAT-NEXT: sd a0, 248(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 13
; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill
; REMAT-NEXT: add a5, a1, a5
-; REMAT-NEXT: sd a5, 264(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s8, a1, s8
-; REMAT-NEXT: sd s8, 256(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s9, a1, s9
-; REMAT-NEXT: sd s9, 248(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s10, a1, s10
-; REMAT-NEXT: sd s10, 240(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s11, a1, s11
-; REMAT-NEXT: sd s11, 232(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill
; REMAT-NEXT: add ra, a1, ra
; REMAT-NEXT: sd ra, 224(sp) # 8-byte Folded Spill
; REMAT-NEXT: li a0, 29
@@ -1483,22 +1548,16 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 112(sp) # 8-byte Folded Spill
-; REMAT-NEXT: lui a0, 5
-; REMAT-NEXT: addiw a0, a0, 1536
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s4, a1, s4
+; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
; REMAT-NEXT: li a0, 11
; REMAT-NEXT: slli a0, a0, 11
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT: lui a0, 6
-; REMAT-NEXT: addiw a0, a0, -1536
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 23
-; REMAT-NEXT: slli a0, a0, 10
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s3, a1, s3
+; REMAT-NEXT: sd s3, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s2, a1, s2
+; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
; REMAT-NEXT: lui a0, 6
; REMAT-NEXT: addiw a0, a0, -512
; REMAT-NEXT: add a0, a1, a0
@@ -1795,7 +1854,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: csrr a0, vlenb
-; REMAT-NEXT: slli a0, a0, 3
+; REMAT-NEXT: li a1, 14
+; REMAT-NEXT: mul a0, a0, a1
; REMAT-NEXT: add sp, sp, a0
; REMAT-NEXT: .cfi_def_cfa sp, 544
; REMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 0b5856a7000dd4..575a757149ebba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -5682,16 +5682,28 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
;
; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -16
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s3, 0(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: addi sp, sp, -48
+; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48
+; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
; RV32ZVE32F-NEXT: .cfi_offset s0, -4
; RV32ZVE32F-NEXT: .cfi_offset s1, -8
; RV32ZVE32F-NEXT: .cfi_offset s2, -12
; RV32ZVE32F-NEXT: .cfi_offset s3, -16
+; RV32ZVE32F-NEXT: .cfi_offset s4, -20
+; RV32ZVE32F-NEXT: .cfi_offset s5, -24
+; RV32ZVE32F-NEXT: .cfi_offset s6, -28
+; RV32ZVE32F-NEXT: .cfi_offset s7, -32
+; RV32ZVE32F-NEXT: .cfi_offset s8, -36
+; RV32ZVE32F-NEXT: .cfi_offset s9, -40
; RV32ZVE32F-NEXT: .cfi_remember_state
; RV32ZVE32F-NEXT: lw a3, 56(a0)
; RV32ZVE32F-NEXT: lw a4, 60(a0)
@@ -5703,30 +5715,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
; RV32ZVE32F-NEXT: lw t4, 28(a0)
; RV32ZVE32F-NEXT: lw t1, 32(a0)
; RV32ZVE32F-NEXT: lw t2, 36(a0)
-; RV32ZVE32F-NEXT: lw t5, 0(a2)
-; RV32ZVE32F-NEXT: lw t6, 8(a2)
-; RV32ZVE32F-NEXT: lw s0, 16(a2)
-; RV32ZVE32F-NEXT: lw s1, 24(a2)
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vmv.v.x v8, t5
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6
-; RV32ZVE32F-NEXT: lw t5, 32(a2)
-; RV32ZVE32F-NEXT: lw t6, 40(a2)
-; RV32ZVE32F-NEXT: lw s2, 48(a2)
-; RV32ZVE32F-NEXT: lw s3, 56(a2)
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s0
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s1
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t5
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6
; RV32ZVE32F-NEXT: lw s0, 8(a0)
; RV32ZVE32F-NEXT: lw s1, 12(a0)
; RV32ZVE32F-NEXT: lw t5, 16(a0)
; RV32ZVE32F-NEXT: lw t6, 20(a0)
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2
+; RV32ZVE32F-NEXT: lw s2, 32(a2)
+; RV32ZVE32F-NEXT: lw s3, 40(a2)
+; RV32ZVE32F-NEXT: lw s4, 48(a2)
+; RV32ZVE32F-NEXT: lw s5, 56(a2)
+; RV32ZVE32F-NEXT: lw s6, 0(a2)
+; RV32ZVE32F-NEXT: lw s7, 8(a2)
+; RV32ZVE32F-NEXT: lw s8, 16(a2)
+; RV32ZVE32F-NEXT: lw s9, 24(a2)
+; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.v.x v8, s6
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; RV32ZVE32F-NEXT: vmv.x.s a2, v0
; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s9
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5
; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3
; RV32ZVE32F-NEXT: andi s2, a2, 1
; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
@@ -5759,15 +5771,27 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
; RV32ZVE32F-NEXT: sw a3, 0(a0)
; RV32ZVE32F-NEXT: sw a4, 4(a0)
; RV32ZVE32F-NEXT: .LBB51_9: # %else14
-; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s3, 0(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
; RV32ZVE32F-NEXT: .cfi_restore s0
; RV32ZVE32F-NEXT: .cfi_restore s1
; RV32ZVE32F-NEXT: .cfi_restore s2
; RV32ZVE32F-NEXT: .cfi_restore s3
-; RV32ZVE32F-NEXT: addi sp, sp, 16
+; RV32ZVE32F-NEXT: .cfi_restore s4
+; RV32ZVE32F-NEXT: .cfi_restore s5
+; RV32ZVE32F-NEXT: .cfi_restore s6
+; RV32ZVE32F-NEXT: .cfi_restore s7
+; RV32ZVE32F-NEXT: .cfi_restore s8
+; RV32ZVE32F-NEXT: .cfi_restore s9
+; RV32ZVE32F-NEXT: addi sp, sp, 48
; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 0
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 036fee6a13ca4c..a11c02dd5e2cb4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1306,12 +1306,6 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 219(sp)
; ZVFHMIN32-NEXT: lh a0, 564(sp)
; ZVFHMIN32-NEXT: lh a1, 308(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 218(sp)
-; ZVFHMIN32-NEXT: lh a0, 562(sp)
-; ZVFHMIN32-NEXT: lh a1, 306(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7
; ZVFHMIN32-NEXT: csrr a2, vlenb
@@ -1364,82 +1358,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 14
-; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 13
-; ZVFHMIN32-NEXT: addi a2, sp, 848
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: slli a2, a2, 1
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v6, v8, 12
-; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 11
-; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 10
-; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 9
-; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 8
-; ZVFHMIN32-NEXT: vmv.x.s a3, v16
+; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11
+; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10
+; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9
+; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8
+; ZVFHMIN32-NEXT: vmv.x.s t5, v16
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 217(sp)
-; ZVFHMIN32-NEXT: lh a0, 560(sp)
-; ZVFHMIN32-NEXT: lh a1, 304(sp)
+; ZVFHMIN32-NEXT: sb a0, 218(sp)
+; ZVFHMIN32-NEXT: lh a0, 562(sp)
+; ZVFHMIN32-NEXT: lh a1, 306(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v21, v16, 7
-; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 6
-; ZVFHMIN32-NEXT: vslidedown.vi v19, v16, 5
+; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7
+; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6
+; ZVFHMIN32-NEXT: vslidedown.vi v5, v16, 5
; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a4, 10
-; ZVFHMIN32-NEXT: mul a2, a2, a4
+; ZVFHMIN32-NEXT: li a3, 18
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 4
+; ZVFHMIN32-NEXT: li a3, 22
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a4, a2, 4
-; ZVFHMIN32-NEXT: sub a2, a4, a2
+; ZVFHMIN32-NEXT: li a3, 21
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 15
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14
-; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 13
-; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12
-; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 11
+; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15
+; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 13
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 12
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 11
+; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a4, a2, 4
-; ZVFHMIN32-NEXT: add a2, a4, a2
+; ZVFHMIN32-NEXT: li a3, 19
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 10
+; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a4, 11
-; ZVFHMIN32-NEXT: mul a2, a2, a4
+; ZVFHMIN32-NEXT: li a3, 14
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v4, v16, 9
-; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 8
+; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 216(sp)
-; ZVFHMIN32-NEXT: lh a0, 558(sp)
-; ZVFHMIN32-NEXT: lh a1, 302(sp)
+; ZVFHMIN32-NEXT: sb a0, 217(sp)
+; ZVFHMIN32-NEXT: lh a0, 560(sp)
+; ZVFHMIN32-NEXT: lh a1, 304(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 7
-; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 6
-; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 5
+; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 7
+; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 6
+; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 5
; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 4
-; ZVFHMIN32-NEXT: vslidedown.vi v31, v0, 3
-; ZVFHMIN32-NEXT: vslidedown.vi v5, v0, 2
-; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 1
+; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 3
+; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 2
+; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 1
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 15
; ZVFHMIN32-NEXT: csrr a2, vlenb
@@ -1449,88 +1447,99 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 1
+; ZVFHMIN32-NEXT: slli a2, a2, 3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a4, 6
-; ZVFHMIN32-NEXT: mul a2, a2, a4
+; ZVFHMIN32-NEXT: li a3, 6
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 3
+; ZVFHMIN32-NEXT: li a3, 12
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a4, 13
-; ZVFHMIN32-NEXT: mul a2, a2, a4
+; ZVFHMIN32-NEXT: li a3, 10
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a4, 19
-; ZVFHMIN32-NEXT: mul a2, a2, a4
+; ZVFHMIN32-NEXT: slli a2, a2, 4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a4, 21
-; ZVFHMIN32-NEXT: mul a2, a2, a4
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8
+; ZVFHMIN32-NEXT: addi a2, sp, 848
+; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s t4, v26
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 216(sp)
+; ZVFHMIN32-NEXT: lh a0, 558(sp)
+; ZVFHMIN32-NEXT: lh a1, 302(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t3, v20
+; ZVFHMIN32-NEXT: vmv.x.s t1, v28
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 215(sp)
; ZVFHMIN32-NEXT: lh a0, 556(sp)
; ZVFHMIN32-NEXT: lh a1, 300(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t3, v26
-; ZVFHMIN32-NEXT: vmv.x.s t2, v28
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: slli a2, a2, 1
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: addi a2, a2, 848
+; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t2, v0
+; ZVFHMIN32-NEXT: vmv.x.s t0, v4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 214(sp)
; ZVFHMIN32-NEXT: lh a0, 554(sp)
; ZVFHMIN32-NEXT: lh a1, 298(sp)
-; ZVFHMIN32-NEXT: addi a2, sp, 848
-; ZVFHMIN32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t1, v16
-; ZVFHMIN32-NEXT: vmv.x.s t0, v6
+; ZVFHMIN32-NEXT: vmv.x.s a7, v2
+; ZVFHMIN32-NEXT: vmv.x.s a6, v30
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 213(sp)
; ZVFHMIN32-NEXT: lh a0, 552(sp)
; ZVFHMIN32-NEXT: lh a1, 296(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a7, v2
-; ZVFHMIN32-NEXT: vmv.x.s a6, v22
+; ZVFHMIN32-NEXT: vmv.x.s a2, v22
+; ZVFHMIN32-NEXT: sw a2, 104(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v18
+; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 212(sp)
; ZVFHMIN32-NEXT: lh a0, 550(sp)
; ZVFHMIN32-NEXT: lh a1, 294(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a5, v20
-; ZVFHMIN32-NEXT: vmv.x.s a2, v18
-; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v14
+; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v12
+; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 211(sp)
; ZVFHMIN32-NEXT: lh a0, 548(sp)
; ZVFHMIN32-NEXT: lh a1, 292(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v14
-; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v10
+; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: vmv.x.s a2, v8
; ZVFHMIN32-NEXT: sw a2, 124(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
@@ -1539,208 +1548,204 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 210(sp)
; ZVFHMIN32-NEXT: lh a0, 546(sp)
; ZVFHMIN32-NEXT: lh a1, 290(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT: vmv.x.s a3, v24
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT: vmv.x.s t5, v24
; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN32-NEXT: sb a0, 209(sp)
; ZVFHMIN32-NEXT: lh a0, 544(sp)
; ZVFHMIN32-NEXT: lh a1, 288(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a3, 192(sp)
+; ZVFHMIN32-NEXT: sb t5, 192(sp)
; ZVFHMIN32-NEXT: sb a0, 208(sp)
; ZVFHMIN32-NEXT: lh a0, 738(sp)
; ZVFHMIN32-NEXT: lh a1, 482(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v10
-; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: vmv.x.s a2, v12
-; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 177(sp)
-; ZVFHMIN32-NEXT: lh a0, 736(sp)
-; ZVFHMIN32-NEXT: lh a1, 480(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: li a3, 29
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: li a3, 28
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 176(sp)
-; ZVFHMIN32-NEXT: lh a0, 734(sp)
-; ZVFHMIN32-NEXT: lh a1, 478(sp)
+; ZVFHMIN32-NEXT: sb a0, 177(sp)
+; ZVFHMIN32-NEXT: lh a0, 736(sp)
+; ZVFHMIN32-NEXT: lh a1, 480(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: li a3, 27
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: li a3, 26
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 175(sp)
-; ZVFHMIN32-NEXT: lh a0, 732(sp)
-; ZVFHMIN32-NEXT: lh a1, 476(sp)
+; ZVFHMIN32-NEXT: sb a0, 176(sp)
+; ZVFHMIN32-NEXT: lh a0, 734(sp)
+; ZVFHMIN32-NEXT: lh a1, 478(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: li a3, 25
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s9, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: li a3, 24
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 174(sp)
-; ZVFHMIN32-NEXT: lh a0, 730(sp)
-; ZVFHMIN32-NEXT: lh a1, 474(sp)
+; ZVFHMIN32-NEXT: sb a0, 175(sp)
+; ZVFHMIN32-NEXT: lh a0, 732(sp)
+; ZVFHMIN32-NEXT: lh a1, 476(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: li a3, 23
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t4, v21
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 173(sp)
-; ZVFHMIN32-NEXT: lh a0, 728(sp)
-; ZVFHMIN32-NEXT: lh a1, 472(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t6, v3
-; ZVFHMIN32-NEXT: vmv.x.s t5, v19
+; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t5, v3
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 172(sp)
-; ZVFHMIN32-NEXT: lh a0, 726(sp)
-; ZVFHMIN32-NEXT: lh a1, 470(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s10, v11
-; ZVFHMIN32-NEXT: vmv.x.s s11, v7
+; ZVFHMIN32-NEXT: sb a0, 174(sp)
+; ZVFHMIN32-NEXT: lh a0, 730(sp)
+; ZVFHMIN32-NEXT: lh a1, 474(sp)
+; ZVFHMIN32-NEXT: vmv.x.s s2, v31
+; ZVFHMIN32-NEXT: vmv.x.s t6, v5
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 171(sp)
-; ZVFHMIN32-NEXT: lh a0, 724(sp)
-; ZVFHMIN32-NEXT: lh s9, 468(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a4, v9
-; ZVFHMIN32-NEXT: vmv.x.s ra, v29
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s9
+; ZVFHMIN32-NEXT: sb a0, 173(sp)
+; ZVFHMIN32-NEXT: lh a1, 728(sp)
+; ZVFHMIN32-NEXT: lh s10, 472(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a3, v9
+; ZVFHMIN32-NEXT: vmv.x.s a4, v11
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 172(sp)
+; ZVFHMIN32-NEXT: lh a1, 726(sp)
+; ZVFHMIN32-NEXT: lh s10, 470(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a2, v13
+; ZVFHMIN32-NEXT: vmv.x.s s11, v29
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 171(sp)
+; ZVFHMIN32-NEXT: lh ra, 724(sp)
+; ZVFHMIN32-NEXT: lh a0, 468(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a5, v27
+; ZVFHMIN32-NEXT: vmv.x.s s10, v7
+; ZVFHMIN32-NEXT: fmv.h.x fa5, ra
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 170(sp)
; ZVFHMIN32-NEXT: lh a0, 722(sp)
; ZVFHMIN32-NEXT: lh a1, 466(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s9, v31
-; ZVFHMIN32-NEXT: vmv.x.s a3, v5
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 169(sp)
-; ZVFHMIN32-NEXT: lh a0, 720(sp)
-; ZVFHMIN32-NEXT: lh a1, 464(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v27
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT: vmv.x.s ra, v21
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s7
; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
-; ZVFHMIN32-NEXT: sb a0, 168(sp)
-; ZVFHMIN32-NEXT: lh a0, 718(sp)
-; ZVFHMIN32-NEXT: lh a1, 462(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, s6
+; ZVFHMIN32-NEXT: sb a0, 169(sp)
+; ZVFHMIN32-NEXT: lh a0, 720(sp)
+; ZVFHMIN32-NEXT: lh a1, 464(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s4
+; ZVFHMIN32-NEXT: fmv.h.x fa3, s8
; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
-; ZVFHMIN32-NEXT: sb a0, 167(sp)
-; ZVFHMIN32-NEXT: lh a0, 716(sp)
-; ZVFHMIN32-NEXT: lh a1, 460(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa2, s3
-; ZVFHMIN32-NEXT: fmv.h.x fa1, s7
+; ZVFHMIN32-NEXT: sb a0, 168(sp)
+; ZVFHMIN32-NEXT: lh a0, 718(sp)
+; ZVFHMIN32-NEXT: lh a1, 462(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa2, s5
+; ZVFHMIN32-NEXT: fmv.h.x fa1, s9
; ZVFHMIN32-NEXT: fmv.h.x fa0, a0
; ZVFHMIN32-NEXT: fmv.h.x ft0, a1
; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0
-; ZVFHMIN32-NEXT: sb a0, 166(sp)
-; ZVFHMIN32-NEXT: lh a0, 714(sp)
-; ZVFHMIN32-NEXT: lh a1, 458(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa0, s4
-; ZVFHMIN32-NEXT: fmv.h.x ft0, s8
+; ZVFHMIN32-NEXT: sb a0, 167(sp)
+; ZVFHMIN32-NEXT: lh a0, 716(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa0, s6
+; ZVFHMIN32-NEXT: lh a1, 460(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft0, a3
; ZVFHMIN32-NEXT: fmv.h.x ft1, a0
-; ZVFHMIN32-NEXT: fmv.h.x ft2, a1
-; ZVFHMIN32-NEXT: feq.h a0, ft1, ft2
-; ZVFHMIN32-NEXT: sb a0, 165(sp)
-; ZVFHMIN32-NEXT: lh a0, 712(sp)
-; ZVFHMIN32-NEXT: lh a1, 456(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft1, s10
-; ZVFHMIN32-NEXT: fmv.h.x ft2, s11
-; ZVFHMIN32-NEXT: fmv.h.x ft3, a0
-; ZVFHMIN32-NEXT: fmv.h.x ft4, a1
-; ZVFHMIN32-NEXT: feq.h a0, ft3, ft4
-; ZVFHMIN32-NEXT: sb a0, 164(sp)
-; ZVFHMIN32-NEXT: lh a0, 710(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft3, a4
-; ZVFHMIN32-NEXT: lh a1, 454(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft4, ra
-; ZVFHMIN32-NEXT: fmv.h.x ft5, a0
-; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h a1, ft5, fa5
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT: sb a1, 163(sp)
-; ZVFHMIN32-NEXT: lh a1, 708(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft1, a2
-; ZVFHMIN32-NEXT: lh a2, 452(sp)
-; ZVFHMIN32-NEXT: feq.h a3, fa0, fa5
+; ZVFHMIN32-NEXT: feq.h a1, ft1, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: sb a1, 166(sp)
+; ZVFHMIN32-NEXT: lh a1, 714(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft0, a2
+; ZVFHMIN32-NEXT: lh a2, 458(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h a1, ft0, ft1
-; ZVFHMIN32-NEXT: fmv.h.x fa0, a2
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa0
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT: feq.h a1, fa3, ft0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s3
+; ZVFHMIN32-NEXT: sb a2, 165(sp)
+; ZVFHMIN32-NEXT: lh a2, 712(sp)
+; ZVFHMIN32-NEXT: lh a4, 456(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s11
+; ZVFHMIN32-NEXT: feq.h s3, fa2, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa3
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: sb a2, 164(sp)
+; ZVFHMIN32-NEXT: lh a2, 710(sp)
+; ZVFHMIN32-NEXT: lh a4, 454(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa3, s10
+; ZVFHMIN32-NEXT: feq.h a5, fa1, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, ra
+; ZVFHMIN32-NEXT: sb a2, 163(sp)
+; ZVFHMIN32-NEXT: lh a2, 708(sp)
+; ZVFHMIN32-NEXT: lh a4, 452(sp)
+; ZVFHMIN32-NEXT: feq.h s4, fa0, fa3
+; ZVFHMIN32-NEXT: feq.h s5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: sb a2, 162(sp)
; ZVFHMIN32-NEXT: lh a2, 706(sp)
; ZVFHMIN32-NEXT: lh a4, 450(sp)
-; ZVFHMIN32-NEXT: sb a1, 129(sp)
-; ZVFHMIN32-NEXT: feq.h a1, fa1, fa5
-; ZVFHMIN32-NEXT: sb a3, 130(sp)
-; ZVFHMIN32-NEXT: feq.h a3, fa2, ft4
-; ZVFHMIN32-NEXT: sb a1, 131(sp)
-; ZVFHMIN32-NEXT: feq.h a1, fa4, ft2
-; ZVFHMIN32-NEXT: sb a3, 132(sp)
-; ZVFHMIN32-NEXT: feq.h a3, fa3, ft3
+; ZVFHMIN32-NEXT: sb s5, 129(sp)
+; ZVFHMIN32-NEXT: sb s4, 130(sp)
+; ZVFHMIN32-NEXT: sb a5, 131(sp)
+; ZVFHMIN32-NEXT: sb s3, 132(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: sb a3, 133(sp)
-; ZVFHMIN32-NEXT: sb a1, 134(sp)
+; ZVFHMIN32-NEXT: sb a1, 133(sp)
+; ZVFHMIN32-NEXT: sb a3, 134(sp)
; ZVFHMIN32-NEXT: sb a0, 135(sp)
; ZVFHMIN32-NEXT: sb a2, 161(sp)
; ZVFHMIN32-NEXT: lh a0, 610(sp)
; ZVFHMIN32-NEXT: lh a1, 354(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s4, v23
+; ZVFHMIN32-NEXT: vmv.x.s s6, v23
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 10
+; ZVFHMIN32-NEXT: li a3, 18
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1748,12 +1753,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 608(sp)
; ZVFHMIN32-NEXT: lh a1, 352(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 4
+; ZVFHMIN32-NEXT: li a3, 22
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a3, a2, 4
-; ZVFHMIN32-NEXT: sub a2, a3, a2
+; ZVFHMIN32-NEXT: li a3, 21
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
@@ -1762,148 +1768,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 240(sp)
; ZVFHMIN32-NEXT: lh a0, 606(sp)
; ZVFHMIN32-NEXT: lh a1, 350(sp)
-; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7
-; ZVFHMIN32-NEXT: vmv.x.s s6, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2
; ZVFHMIN32-NEXT: sb a0, 239(sp)
; ZVFHMIN32-NEXT: lh a0, 604(sp)
; ZVFHMIN32-NEXT: lh a1, 348(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6
-; ZVFHMIN32-NEXT: vmv.x.s s7, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 238(sp)
; ZVFHMIN32-NEXT: lh a0, 602(sp)
; ZVFHMIN32-NEXT: lh a1, 346(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5
-; ZVFHMIN32-NEXT: vmv.x.s s8, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 237(sp)
; ZVFHMIN32-NEXT: lh a0, 600(sp)
; ZVFHMIN32-NEXT: lh a1, 344(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4
-; ZVFHMIN32-NEXT: vmv.x.s s9, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: vmv.x.s a3, v8
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 236(sp)
; ZVFHMIN32-NEXT: lh a0, 598(sp)
; ZVFHMIN32-NEXT: lh a1, 342(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3
-; ZVFHMIN32-NEXT: vmv.x.s s10, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: vmv.x.s a4, v8
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 235(sp)
; ZVFHMIN32-NEXT: lh a0, 596(sp)
; ZVFHMIN32-NEXT: lh a1, 340(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2
-; ZVFHMIN32-NEXT: vmv.x.s s11, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: vmv.x.s a5, v8
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 234(sp)
; ZVFHMIN32-NEXT: lh a0, 594(sp)
; ZVFHMIN32-NEXT: lh a1, 338(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1
-; ZVFHMIN32-NEXT: vmv.x.s ra, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: vmv.x.s t6, v8
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 233(sp)
; ZVFHMIN32-NEXT: lh a0, 592(sp)
; ZVFHMIN32-NEXT: lh a1, 336(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: vmv.x.s s2, v8
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 232(sp)
; ZVFHMIN32-NEXT: lh a0, 590(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a2
; ZVFHMIN32-NEXT: lh a1, 334(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa3, t5
-; ZVFHMIN32-NEXT: fmv.h.x fa2, s4
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
; ZVFHMIN32-NEXT: fmv.h.x fa1, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa0, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0
+; ZVFHMIN32-NEXT: feq.h t5, fa3, fa2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa1, fa3
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a3
; ZVFHMIN32-NEXT: sb a0, 231(sp)
; ZVFHMIN32-NEXT: lh a0, 588(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a4
; ZVFHMIN32-NEXT: lh a1, 332(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa1, s2
-; ZVFHMIN32-NEXT: fmv.h.x fa0, s5
-; ZVFHMIN32-NEXT: fmv.h.x ft0, a0
-; ZVFHMIN32-NEXT: fmv.h.x ft1, a1
-; ZVFHMIN32-NEXT: feq.h a0, ft0, ft1
-; ZVFHMIN32-NEXT: sb a0, 230(sp)
-; ZVFHMIN32-NEXT: lh a0, 586(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft0, s3
-; ZVFHMIN32-NEXT: lh a1, 330(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft1, s6
-; ZVFHMIN32-NEXT: fmv.h.x ft2, a0
-; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1
+; ZVFHMIN32-NEXT: feq.h a3, fa5, fa3
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s6
+; ZVFHMIN32-NEXT: sb a1, 230(sp)
+; ZVFHMIN32-NEXT: lh a1, 586(sp)
+; ZVFHMIN32-NEXT: lh a4, 330(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h a1, ft2, fa5
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s7
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s5
; ZVFHMIN32-NEXT: sb a1, 229(sp)
; ZVFHMIN32-NEXT: lh a1, 584(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft1, s8
-; ZVFHMIN32-NEXT: lh a2, 328(sp)
-; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: lh a4, 328(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa3, ft1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT: sb a2, 228(sp)
-; ZVFHMIN32-NEXT: lh a2, 582(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s4
+; ZVFHMIN32-NEXT: sb a1, 228(sp)
+; ZVFHMIN32-NEXT: lh a1, 582(sp)
; ZVFHMIN32-NEXT: lh a4, 326(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT: feq.h t4, fa2, fa5
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s11
-; ZVFHMIN32-NEXT: fmv.h.x fa3, ra
-; ZVFHMIN32-NEXT: sb a2, 227(sp)
-; ZVFHMIN32-NEXT: lh a2, 580(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s3
+; ZVFHMIN32-NEXT: sb a1, 227(sp)
+; ZVFHMIN32-NEXT: lh a1, 580(sp)
; ZVFHMIN32-NEXT: lh a4, 324(sp)
-; ZVFHMIN32-NEXT: feq.h t5, fa0, fa5
-; ZVFHMIN32-NEXT: feq.h t6, ft0, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3
-; ZVFHMIN32-NEXT: sb a2, 226(sp)
-; ZVFHMIN32-NEXT: lh a2, 578(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 226(sp)
+; ZVFHMIN32-NEXT: lh a1, 578(sp)
; ZVFHMIN32-NEXT: lh a4, 322(sp)
-; ZVFHMIN32-NEXT: sb t6, 193(sp)
-; ZVFHMIN32-NEXT: feq.h t6, fa1, fa4
-; ZVFHMIN32-NEXT: sb t5, 194(sp)
+; ZVFHMIN32-NEXT: sb a2, 193(sp)
+; ZVFHMIN32-NEXT: sb s2, 194(sp)
; ZVFHMIN32-NEXT: sb t6, 195(sp)
-; ZVFHMIN32-NEXT: sb t4, 196(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: sb a5, 196(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 197(sp)
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 197(sp)
; ZVFHMIN32-NEXT: sb a3, 198(sp)
-; ZVFHMIN32-NEXT: sb a0, 199(sp)
-; ZVFHMIN32-NEXT: sb a2, 225(sp)
+; ZVFHMIN32-NEXT: sb t5, 199(sp)
+; ZVFHMIN32-NEXT: sb a1, 225(sp)
; ZVFHMIN32-NEXT: lh a0, 766(sp)
; ZVFHMIN32-NEXT: lh a1, 510(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a3, a2, 4
-; ZVFHMIN32-NEXT: add a2, a3, a2
+; ZVFHMIN32-NEXT: li a3, 19
+; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s s2, v8
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 11
+; ZVFHMIN32-NEXT: li a3, 14
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
@@ -1915,305 +1921,301 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 191(sp)
; ZVFHMIN32-NEXT: lh a0, 764(sp)
; ZVFHMIN32-NEXT: lh a1, 508(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t5, v4
-; ZVFHMIN32-NEXT: vmv.x.s t4, v30
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 190(sp)
-; ZVFHMIN32-NEXT: lh a0, 762(sp)
-; ZVFHMIN32-NEXT: lh a1, 506(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t5, v6
; ZVFHMIN32-NEXT: csrr a2, vlenb
; ZVFHMIN32-NEXT: slli a2, a2, 2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 190(sp)
+; ZVFHMIN32-NEXT: lh a0, 762(sp)
+; ZVFHMIN32-NEXT: lh a1, 506(sp)
; ZVFHMIN32-NEXT: csrr a3, vlenb
-; ZVFHMIN32-NEXT: slli a3, a3, 1
+; ZVFHMIN32-NEXT: slli a3, a3, 3
; ZVFHMIN32-NEXT: add a3, sp, a3
; ZVFHMIN32-NEXT: addi a3, a3, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a3, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 189(sp)
-; ZVFHMIN32-NEXT: lh a0, 760(sp)
-; ZVFHMIN32-NEXT: lh a1, 504(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t3
; ZVFHMIN32-NEXT: csrr a4, vlenb
-; ZVFHMIN32-NEXT: li t3, 6
-; ZVFHMIN32-NEXT: mul a4, a4, t3
+; ZVFHMIN32-NEXT: li a5, 6
+; ZVFHMIN32-NEXT: mul a4, a4, a5
; ZVFHMIN32-NEXT: add a4, sp, a4
; ZVFHMIN32-NEXT: addi a4, a4, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a4, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
-; ZVFHMIN32-NEXT: sb a0, 188(sp)
-; ZVFHMIN32-NEXT: lh a0, 758(sp)
-; ZVFHMIN32-NEXT: lh a1, 502(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT: csrr t2, vlenb
-; ZVFHMIN32-NEXT: slli t2, t2, 3
-; ZVFHMIN32-NEXT: add t2, sp, t2
-; ZVFHMIN32-NEXT: addi t2, t2, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t2, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT: sb a0, 187(sp)
-; ZVFHMIN32-NEXT: lh a0, 756(sp)
-; ZVFHMIN32-NEXT: lh a1, 500(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa3, t1
-; ZVFHMIN32-NEXT: csrr t1, vlenb
-; ZVFHMIN32-NEXT: li t3, 13
-; ZVFHMIN32-NEXT: mul t1, t1, t3
-; ZVFHMIN32-NEXT: add t1, sp, t1
-; ZVFHMIN32-NEXT: addi t1, t1, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t3, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
-; ZVFHMIN32-NEXT: sb a0, 186(sp)
-; ZVFHMIN32-NEXT: lh a0, 754(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa2, t0
-; ZVFHMIN32-NEXT: lh a1, 498(sp)
-; ZVFHMIN32-NEXT: csrr t0, vlenb
-; ZVFHMIN32-NEXT: li t1, 19
-; ZVFHMIN32-NEXT: mul t0, t0, t1
-; ZVFHMIN32-NEXT: add t0, sp, t0
-; ZVFHMIN32-NEXT: addi t0, t0, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s s3, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 189(sp)
+; ZVFHMIN32-NEXT: lh a1, 760(sp)
+; ZVFHMIN32-NEXT: lh a5, 504(sp)
+; ZVFHMIN32-NEXT: csrr a0, vlenb
+; ZVFHMIN32-NEXT: li s3, 12
+; ZVFHMIN32-NEXT: mul a0, a0, s3
+; ZVFHMIN32-NEXT: add a0, sp, a0
+; ZVFHMIN32-NEXT: addi a0, a0, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s s5, v8
; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li t0, 21
-; ZVFHMIN32-NEXT: mul a0, a0, t0
+; ZVFHMIN32-NEXT: li s3, 10
+; ZVFHMIN32-NEXT: mul a0, a0, s3
; ZVFHMIN32-NEXT: add a0, sp, a0
; ZVFHMIN32-NEXT: addi a0, a0, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a0, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa0, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa1, fa0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 188(sp)
+; ZVFHMIN32-NEXT: lh a1, 758(sp)
+; ZVFHMIN32-NEXT: lh a5, 502(sp)
+; ZVFHMIN32-NEXT: csrr s3, vlenb
+; ZVFHMIN32-NEXT: slli s3, s3, 4
+; ZVFHMIN32-NEXT: add s3, sp, s3
+; ZVFHMIN32-NEXT: addi s3, s3, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s s4, v8
+; ZVFHMIN32-NEXT: vmv.x.s s3, v16
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT: sb a1, 187(sp)
+; ZVFHMIN32-NEXT: lh a1, 756(sp)
+; ZVFHMIN32-NEXT: lh a5, 500(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h t4, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT: sb a1, 186(sp)
+; ZVFHMIN32-NEXT: lh a1, 754(sp)
+; ZVFHMIN32-NEXT: lh a2, 498(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t1
; ZVFHMIN32-NEXT: sb a1, 185(sp)
; ZVFHMIN32-NEXT: lh a1, 752(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa0, a3
; ZVFHMIN32-NEXT: lh a2, 496(sp)
-; ZVFHMIN32-NEXT: feq.h t0, fa5, fa1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h t1, fa4, fa0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
; ZVFHMIN32-NEXT: sb a1, 184(sp)
; ZVFHMIN32-NEXT: lh a1, 750(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
; ZVFHMIN32-NEXT: lh a2, 494(sp)
-; ZVFHMIN32-NEXT: feq.h a3, fa3, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s5
+; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa2, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT: sb a1, 183(sp)
+; ZVFHMIN32-NEXT: lh a1, 748(sp)
+; ZVFHMIN32-NEXT: lh a2, 492(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT: sb a2, 183(sp)
-; ZVFHMIN32-NEXT: lh a2, 748(sp)
-; ZVFHMIN32-NEXT: lh a4, 492(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 182(sp)
+; ZVFHMIN32-NEXT: lh a1, 746(sp)
+; ZVFHMIN32-NEXT: lh a2, 490(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s4
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT: sb a2, 182(sp)
-; ZVFHMIN32-NEXT: lh a2, 746(sp)
-; ZVFHMIN32-NEXT: lh a4, 490(sp)
+; ZVFHMIN32-NEXT: sb a1, 181(sp)
+; ZVFHMIN32-NEXT: lh a1, 744(sp)
+; ZVFHMIN32-NEXT: lh a2, 488(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, s3
; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: lw a2, 104(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT: sb a2, 181(sp)
-; ZVFHMIN32-NEXT: lh a2, 744(sp)
-; ZVFHMIN32-NEXT: lh a4, 488(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: lw a4, 108(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT: vmv.x.s a5, v0
+; ZVFHMIN32-NEXT: addi a2, sp, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT: vmv.x.s a4, v8
-; ZVFHMIN32-NEXT: sb a2, 180(sp)
-; ZVFHMIN32-NEXT: lh a2, 742(sp)
-; ZVFHMIN32-NEXT: lh t2, 486(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: vmv.x.s a5, v8
+; ZVFHMIN32-NEXT: sb a1, 180(sp)
+; ZVFHMIN32-NEXT: lh a1, 742(sp)
+; ZVFHMIN32-NEXT: lh a7, 486(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: sb a2, 179(sp)
-; ZVFHMIN32-NEXT: lh a2, 740(sp)
-; ZVFHMIN32-NEXT: lh t2, 484(sp)
-; ZVFHMIN32-NEXT: sb a1, 140(sp)
-; ZVFHMIN32-NEXT: sb a3, 141(sp)
-; ZVFHMIN32-NEXT: sb t1, 142(sp)
-; ZVFHMIN32-NEXT: sb t0, 143(sp)
-; ZVFHMIN32-NEXT: sb a5, 136(sp)
-; ZVFHMIN32-NEXT: sb a0, 137(sp)
-; ZVFHMIN32-NEXT: sb a6, 138(sp)
-; ZVFHMIN32-NEXT: sb a7, 139(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 179(sp)
+; ZVFHMIN32-NEXT: lh a1, 740(sp)
+; ZVFHMIN32-NEXT: lh a7, 484(sp)
+; ZVFHMIN32-NEXT: sb a3, 140(sp)
+; ZVFHMIN32-NEXT: sb t1, 141(sp)
+; ZVFHMIN32-NEXT: sb t3, 142(sp)
+; ZVFHMIN32-NEXT: sb t4, 143(sp)
+; ZVFHMIN32-NEXT: sb a2, 136(sp)
+; ZVFHMIN32-NEXT: sb a6, 137(sp)
+; ZVFHMIN32-NEXT: sb a4, 138(sp)
+; ZVFHMIN32-NEXT: sb a0, 139(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 178(sp)
-; ZVFHMIN32-NEXT: lh a0, 638(sp)
-; ZVFHMIN32-NEXT: lh a1, 382(sp)
+; ZVFHMIN32-NEXT: lh a1, 638(sp)
+; ZVFHMIN32-NEXT: lh a2, 382(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14
-; ZVFHMIN32-NEXT: vmv.x.s t2, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 255(sp)
-; ZVFHMIN32-NEXT: lh a0, 636(sp)
-; ZVFHMIN32-NEXT: lh a1, 380(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a0, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 255(sp)
+; ZVFHMIN32-NEXT: lh a1, 636(sp)
+; ZVFHMIN32-NEXT: lh a2, 380(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13
-; ZVFHMIN32-NEXT: vmv.x.s t1, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 254(sp)
-; ZVFHMIN32-NEXT: lh a0, 634(sp)
-; ZVFHMIN32-NEXT: lh a1, 378(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t2, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 254(sp)
+; ZVFHMIN32-NEXT: lh a1, 634(sp)
+; ZVFHMIN32-NEXT: lh a2, 378(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12
-; ZVFHMIN32-NEXT: vmv.x.s t0, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 253(sp)
-; ZVFHMIN32-NEXT: lh a0, 632(sp)
-; ZVFHMIN32-NEXT: lh a1, 376(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t1, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 253(sp)
+; ZVFHMIN32-NEXT: lh a1, 632(sp)
+; ZVFHMIN32-NEXT: lh a2, 376(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11
-; ZVFHMIN32-NEXT: vmv.x.s a7, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 252(sp)
-; ZVFHMIN32-NEXT: lh a0, 630(sp)
-; ZVFHMIN32-NEXT: lh a1, 374(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t0, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 252(sp)
+; ZVFHMIN32-NEXT: lh a1, 630(sp)
+; ZVFHMIN32-NEXT: lh a2, 374(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10
-; ZVFHMIN32-NEXT: vmv.x.s a6, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 251(sp)
-; ZVFHMIN32-NEXT: lh a0, 628(sp)
-; ZVFHMIN32-NEXT: lh a1, 372(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a7, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 251(sp)
+; ZVFHMIN32-NEXT: lh a1, 628(sp)
+; ZVFHMIN32-NEXT: lh a2, 372(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9
-; ZVFHMIN32-NEXT: vmv.x.s a5, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: vmv.x.s a6, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: lw a2, 108(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: sb a1, 250(sp)
+; ZVFHMIN32-NEXT: lh a1, 626(sp)
+; ZVFHMIN32-NEXT: lh a2, 370(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: lw a2, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: sb a1, 249(sp)
+; ZVFHMIN32-NEXT: lh a1, 624(sp)
+; ZVFHMIN32-NEXT: lh a2, 368(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 250(sp)
-; ZVFHMIN32-NEXT: lh a0, 626(sp)
-; ZVFHMIN32-NEXT: lh a1, 370(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: sb a0, 248(sp)
+; ZVFHMIN32-NEXT: lh a0, 622(sp)
+; ZVFHMIN32-NEXT: lh a1, 366(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 249(sp)
-; ZVFHMIN32-NEXT: lh a1, 624(sp)
-; ZVFHMIN32-NEXT: lh a3, 368(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: lw a3, 112(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT: sb a1, 248(sp)
-; ZVFHMIN32-NEXT: lh a1, 622(sp)
-; ZVFHMIN32-NEXT: lh a3, 366(sp)
+; ZVFHMIN32-NEXT: sb a0, 247(sp)
+; ZVFHMIN32-NEXT: lh a0, 620(sp)
+; ZVFHMIN32-NEXT: lh a1, 364(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: lw a3, 120(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT: sb a1, 247(sp)
-; ZVFHMIN32-NEXT: lh a1, 620(sp)
-; ZVFHMIN32-NEXT: lh a3, 364(sp)
+; ZVFHMIN32-NEXT: sb a0, 246(sp)
+; ZVFHMIN32-NEXT: lh a0, 618(sp)
+; ZVFHMIN32-NEXT: lh a1, 362(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, t0
; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT: sb a1, 246(sp)
-; ZVFHMIN32-NEXT: lh a1, 618(sp)
-; ZVFHMIN32-NEXT: lh a3, 362(sp)
+; ZVFHMIN32-NEXT: sb a0, 245(sp)
+; ZVFHMIN32-NEXT: lh a0, 616(sp)
+; ZVFHMIN32-NEXT: lh a1, 360(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT: sb a1, 245(sp)
-; ZVFHMIN32-NEXT: lh a1, 616(sp)
-; ZVFHMIN32-NEXT: lh a3, 360(sp)
+; ZVFHMIN32-NEXT: sb a0, 244(sp)
+; ZVFHMIN32-NEXT: lh a0, 614(sp)
+; ZVFHMIN32-NEXT: lh a1, 358(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, a6
; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT: sb a1, 244(sp)
-; ZVFHMIN32-NEXT: lh a1, 614(sp)
-; ZVFHMIN32-NEXT: lh a3, 358(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8
-; ZVFHMIN32-NEXT: vmv.x.s a3, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: sb a1, 243(sp)
-; ZVFHMIN32-NEXT: lh a1, 612(sp)
-; ZVFHMIN32-NEXT: lh a3, 356(sp)
-; ZVFHMIN32-NEXT: sb t0, 204(sp)
-; ZVFHMIN32-NEXT: sb a4, 205(sp)
-; ZVFHMIN32-NEXT: sb a0, 206(sp)
-; ZVFHMIN32-NEXT: sb a2, 207(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a1, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: sb a0, 243(sp)
+; ZVFHMIN32-NEXT: lh a0, 612(sp)
+; ZVFHMIN32-NEXT: lh a1, 356(sp)
+; ZVFHMIN32-NEXT: sb a5, 204(sp)
+; ZVFHMIN32-NEXT: sb a2, 205(sp)
+; ZVFHMIN32-NEXT: sb a3, 206(sp)
+; ZVFHMIN32-NEXT: sb a4, 207(sp)
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a2, 200(sp)
+; ZVFHMIN32-NEXT: sb a6, 201(sp)
+; ZVFHMIN32-NEXT: sb a7, 202(sp)
+; ZVFHMIN32-NEXT: sb t0, 203(sp)
+; ZVFHMIN32-NEXT: li a2, 128
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 200(sp)
-; ZVFHMIN32-NEXT: sb a5, 201(sp)
-; ZVFHMIN32-NEXT: sb a6, 202(sp)
-; ZVFHMIN32-NEXT: sb a7, 203(sp)
-; ZVFHMIN32-NEXT: li a0, 128
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 242(sp)
-; ZVFHMIN32-NEXT: addi a1, sp, 128
-; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN32-NEXT: vle8.v v8, (a1)
+; ZVFHMIN32-NEXT: sb a0, 242(sp)
+; ZVFHMIN32-NEXT: addi a0, sp, 128
+; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN32-NEXT: vle8.v v8, (a0)
; ZVFHMIN32-NEXT: vand.vi v8, v8, 1
; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN32-NEXT: addi sp, s0, -896
@@ -2440,12 +2442,6 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 219(sp)
; ZVFHMIN64-NEXT: lh a0, 564(sp)
; ZVFHMIN64-NEXT: lh a1, 308(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 218(sp)
-; ZVFHMIN64-NEXT: lh a0, 562(sp)
-; ZVFHMIN64-NEXT: lh a1, 306(sp)
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7
; ZVFHMIN64-NEXT: csrr a2, vlenb
@@ -2498,82 +2494,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 14
-; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 13
-; ZVFHMIN64-NEXT: addi a2, sp, 800
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: slli a2, a2, 1
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v6, v8, 12
-; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 11
-; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 10
-; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 9
-; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 8
-; ZVFHMIN64-NEXT: vmv.x.s a3, v16
+; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11
+; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10
+; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9
+; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8
+; ZVFHMIN64-NEXT: vmv.x.s t5, v16
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 217(sp)
-; ZVFHMIN64-NEXT: lh a0, 560(sp)
-; ZVFHMIN64-NEXT: lh a1, 304(sp)
+; ZVFHMIN64-NEXT: sb a0, 218(sp)
+; ZVFHMIN64-NEXT: lh a0, 562(sp)
+; ZVFHMIN64-NEXT: lh a1, 306(sp)
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v21, v16, 7
-; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 6
-; ZVFHMIN64-NEXT: vslidedown.vi v19, v16, 5
+; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7
+; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6
+; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5
; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a4, 10
-; ZVFHMIN64-NEXT: mul a2, a2, a4
+; ZVFHMIN64-NEXT: li a3, 18
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 4
+; ZVFHMIN64-NEXT: li a3, 22
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a4, a2, 4
-; ZVFHMIN64-NEXT: sub a2, a4, a2
+; ZVFHMIN64-NEXT: li a3, 21
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 15
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14
-; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 13
-; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12
-; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 11
+; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15
+; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 13
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 12
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 11
+; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a4, a2, 4
-; ZVFHMIN64-NEXT: add a2, a4, a2
+; ZVFHMIN64-NEXT: li a3, 19
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 10
+; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a4, 11
-; ZVFHMIN64-NEXT: mul a2, a2, a4
+; ZVFHMIN64-NEXT: li a3, 14
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v4, v16, 9
-; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 8
+; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 216(sp)
-; ZVFHMIN64-NEXT: lh a0, 558(sp)
-; ZVFHMIN64-NEXT: lh a1, 302(sp)
-; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 7
-; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 6
-; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 5
-; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4
-; ZVFHMIN64-NEXT: vslidedown.vi v31, v0, 3
-; ZVFHMIN64-NEXT: vslidedown.vi v5, v0, 2
-; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1
+; ZVFHMIN64-NEXT: sb a0, 217(sp)
+; ZVFHMIN64-NEXT: lh a0, 560(sp)
+; ZVFHMIN64-NEXT: lh a1, 304(sp)
+; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 7
+; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 6
+; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 5
+; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4
+; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 3
+; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 2
+; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 1
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15
; ZVFHMIN64-NEXT: csrr a2, vlenb
@@ -2583,88 +2583,99 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 1
+; ZVFHMIN64-NEXT: slli a2, a2, 3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a4, 6
-; ZVFHMIN64-NEXT: mul a2, a2, a4
+; ZVFHMIN64-NEXT: li a3, 6
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 3
+; ZVFHMIN64-NEXT: li a3, 12
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a4, 13
-; ZVFHMIN64-NEXT: mul a2, a2, a4
+; ZVFHMIN64-NEXT: li a3, 10
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a4, 19
-; ZVFHMIN64-NEXT: mul a2, a2, a4
+; ZVFHMIN64-NEXT: slli a2, a2, 4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a4, 21
-; ZVFHMIN64-NEXT: mul a2, a2, a4
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8
+; ZVFHMIN64-NEXT: addi a2, sp, 800
+; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s t4, v26
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 216(sp)
+; ZVFHMIN64-NEXT: lh a0, 558(sp)
+; ZVFHMIN64-NEXT: lh a1, 302(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t3, v20
+; ZVFHMIN64-NEXT: vmv.x.s t1, v28
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 215(sp)
; ZVFHMIN64-NEXT: lh a0, 556(sp)
; ZVFHMIN64-NEXT: lh a1, 300(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t3, v26
-; ZVFHMIN64-NEXT: vmv.x.s t2, v28
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: slli a2, a2, 1
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: addi a2, a2, 800
+; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t2, v0
+; ZVFHMIN64-NEXT: vmv.x.s t0, v4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 214(sp)
; ZVFHMIN64-NEXT: lh a0, 554(sp)
; ZVFHMIN64-NEXT: lh a1, 298(sp)
-; ZVFHMIN64-NEXT: addi a2, sp, 800
-; ZVFHMIN64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t1, v16
-; ZVFHMIN64-NEXT: vmv.x.s t0, v6
+; ZVFHMIN64-NEXT: vmv.x.s a7, v2
+; ZVFHMIN64-NEXT: vmv.x.s a6, v30
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 213(sp)
; ZVFHMIN64-NEXT: lh a0, 552(sp)
; ZVFHMIN64-NEXT: lh a1, 296(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a7, v2
-; ZVFHMIN64-NEXT: vmv.x.s a6, v22
+; ZVFHMIN64-NEXT: vmv.x.s a2, v22
+; ZVFHMIN64-NEXT: sd a2, 80(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v18
+; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 212(sp)
; ZVFHMIN64-NEXT: lh a0, 550(sp)
; ZVFHMIN64-NEXT: lh a1, 294(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a5, v20
-; ZVFHMIN64-NEXT: vmv.x.s a2, v18
-; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v14
+; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v12
+; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 211(sp)
; ZVFHMIN64-NEXT: lh a0, 548(sp)
; ZVFHMIN64-NEXT: lh a1, 292(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v14
-; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v10
+; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: sd a2, 120(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
@@ -2673,208 +2684,204 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 210(sp)
; ZVFHMIN64-NEXT: lh a0, 546(sp)
; ZVFHMIN64-NEXT: lh a1, 290(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT: vmv.x.s a3, v24
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT: vmv.x.s t5, v24
; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN64-NEXT: sb a0, 209(sp)
; ZVFHMIN64-NEXT: lh a0, 544(sp)
; ZVFHMIN64-NEXT: lh a1, 288(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a3, 192(sp)
+; ZVFHMIN64-NEXT: sb t5, 192(sp)
; ZVFHMIN64-NEXT: sb a0, 208(sp)
; ZVFHMIN64-NEXT: lh a0, 738(sp)
; ZVFHMIN64-NEXT: lh a1, 482(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v10
-; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: vmv.x.s a2, v12
-; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 177(sp)
-; ZVFHMIN64-NEXT: lh a0, 736(sp)
-; ZVFHMIN64-NEXT: lh a1, 480(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: li a3, 29
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: li a3, 28
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 176(sp)
-; ZVFHMIN64-NEXT: lh a0, 734(sp)
-; ZVFHMIN64-NEXT: lh a1, 478(sp)
+; ZVFHMIN64-NEXT: sb a0, 177(sp)
+; ZVFHMIN64-NEXT: lh a0, 736(sp)
+; ZVFHMIN64-NEXT: lh a1, 480(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: li a3, 27
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: li a3, 26
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 175(sp)
-; ZVFHMIN64-NEXT: lh a0, 732(sp)
-; ZVFHMIN64-NEXT: lh a1, 476(sp)
+; ZVFHMIN64-NEXT: sb a0, 176(sp)
+; ZVFHMIN64-NEXT: lh a0, 734(sp)
+; ZVFHMIN64-NEXT: lh a1, 478(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: li a3, 25
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s9, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: li a3, 24
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 174(sp)
-; ZVFHMIN64-NEXT: lh a0, 730(sp)
-; ZVFHMIN64-NEXT: lh a1, 474(sp)
+; ZVFHMIN64-NEXT: sb a0, 175(sp)
+; ZVFHMIN64-NEXT: lh a0, 732(sp)
+; ZVFHMIN64-NEXT: lh a1, 476(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: li a3, 23
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t4, v21
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 173(sp)
-; ZVFHMIN64-NEXT: lh a0, 728(sp)
-; ZVFHMIN64-NEXT: lh a1, 472(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t6, v3
-; ZVFHMIN64-NEXT: vmv.x.s t5, v19
+; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t5, v3
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 172(sp)
-; ZVFHMIN64-NEXT: lh a0, 726(sp)
-; ZVFHMIN64-NEXT: lh a1, 470(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s10, v11
-; ZVFHMIN64-NEXT: vmv.x.s s11, v7
+; ZVFHMIN64-NEXT: sb a0, 174(sp)
+; ZVFHMIN64-NEXT: lh a0, 730(sp)
+; ZVFHMIN64-NEXT: lh a1, 474(sp)
+; ZVFHMIN64-NEXT: vmv.x.s s2, v31
+; ZVFHMIN64-NEXT: vmv.x.s t6, v5
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 171(sp)
-; ZVFHMIN64-NEXT: lh a0, 724(sp)
-; ZVFHMIN64-NEXT: lh s9, 468(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a4, v9
-; ZVFHMIN64-NEXT: vmv.x.s ra, v29
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s9
+; ZVFHMIN64-NEXT: sb a0, 173(sp)
+; ZVFHMIN64-NEXT: lh a1, 728(sp)
+; ZVFHMIN64-NEXT: lh s10, 472(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a3, v9
+; ZVFHMIN64-NEXT: vmv.x.s a4, v11
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 172(sp)
+; ZVFHMIN64-NEXT: lh a1, 726(sp)
+; ZVFHMIN64-NEXT: lh s10, 470(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a2, v13
+; ZVFHMIN64-NEXT: vmv.x.s s11, v29
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 171(sp)
+; ZVFHMIN64-NEXT: lh ra, 724(sp)
+; ZVFHMIN64-NEXT: lh a0, 468(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a5, v27
+; ZVFHMIN64-NEXT: vmv.x.s s10, v7
+; ZVFHMIN64-NEXT: fmv.h.x fa5, ra
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 170(sp)
; ZVFHMIN64-NEXT: lh a0, 722(sp)
; ZVFHMIN64-NEXT: lh a1, 466(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s9, v31
-; ZVFHMIN64-NEXT: vmv.x.s a3, v5
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 169(sp)
-; ZVFHMIN64-NEXT: lh a0, 720(sp)
-; ZVFHMIN64-NEXT: lh a1, 464(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v27
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT: vmv.x.s ra, v21
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s7
; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
-; ZVFHMIN64-NEXT: sb a0, 168(sp)
-; ZVFHMIN64-NEXT: lh a0, 718(sp)
-; ZVFHMIN64-NEXT: lh a1, 462(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, s6
+; ZVFHMIN64-NEXT: sb a0, 169(sp)
+; ZVFHMIN64-NEXT: lh a0, 720(sp)
+; ZVFHMIN64-NEXT: lh a1, 464(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s4
+; ZVFHMIN64-NEXT: fmv.h.x fa3, s8
; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
-; ZVFHMIN64-NEXT: sb a0, 167(sp)
-; ZVFHMIN64-NEXT: lh a0, 716(sp)
-; ZVFHMIN64-NEXT: lh a1, 460(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa2, s3
-; ZVFHMIN64-NEXT: fmv.h.x fa1, s7
+; ZVFHMIN64-NEXT: sb a0, 168(sp)
+; ZVFHMIN64-NEXT: lh a0, 718(sp)
+; ZVFHMIN64-NEXT: lh a1, 462(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa2, s5
+; ZVFHMIN64-NEXT: fmv.h.x fa1, s9
; ZVFHMIN64-NEXT: fmv.h.x fa0, a0
; ZVFHMIN64-NEXT: fmv.h.x ft0, a1
; ZVFHMIN64-NEXT: feq.h a0, fa0, ft0
-; ZVFHMIN64-NEXT: sb a0, 166(sp)
-; ZVFHMIN64-NEXT: lh a0, 714(sp)
-; ZVFHMIN64-NEXT: lh a1, 458(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa0, s4
-; ZVFHMIN64-NEXT: fmv.h.x ft0, s8
+; ZVFHMIN64-NEXT: sb a0, 167(sp)
+; ZVFHMIN64-NEXT: lh a0, 716(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa0, s6
+; ZVFHMIN64-NEXT: lh a1, 460(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft0, a3
; ZVFHMIN64-NEXT: fmv.h.x ft1, a0
-; ZVFHMIN64-NEXT: fmv.h.x ft2, a1
-; ZVFHMIN64-NEXT: feq.h a0, ft1, ft2
-; ZVFHMIN64-NEXT: sb a0, 165(sp)
-; ZVFHMIN64-NEXT: lh a0, 712(sp)
-; ZVFHMIN64-NEXT: lh a1, 456(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft1, s10
-; ZVFHMIN64-NEXT: fmv.h.x ft2, s11
-; ZVFHMIN64-NEXT: fmv.h.x ft3, a0
-; ZVFHMIN64-NEXT: fmv.h.x ft4, a1
-; ZVFHMIN64-NEXT: feq.h a0, ft3, ft4
-; ZVFHMIN64-NEXT: sb a0, 164(sp)
-; ZVFHMIN64-NEXT: lh a0, 710(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft3, a4
-; ZVFHMIN64-NEXT: lh a1, 454(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft4, ra
-; ZVFHMIN64-NEXT: fmv.h.x ft5, a0
-; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h a1, ft5, fa5
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT: sb a1, 163(sp)
-; ZVFHMIN64-NEXT: lh a1, 708(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft1, a2
-; ZVFHMIN64-NEXT: lh a2, 452(sp)
-; ZVFHMIN64-NEXT: feq.h a3, fa0, fa5
+; ZVFHMIN64-NEXT: feq.h a1, ft1, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: sb a1, 166(sp)
+; ZVFHMIN64-NEXT: lh a1, 714(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft0, a2
+; ZVFHMIN64-NEXT: lh a2, 458(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h a1, ft0, ft1
-; ZVFHMIN64-NEXT: fmv.h.x fa0, a2
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa0
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT: feq.h a1, fa3, ft0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s3
+; ZVFHMIN64-NEXT: sb a2, 165(sp)
+; ZVFHMIN64-NEXT: lh a2, 712(sp)
+; ZVFHMIN64-NEXT: lh a4, 456(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s11
+; ZVFHMIN64-NEXT: feq.h s3, fa2, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa3
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: sb a2, 164(sp)
+; ZVFHMIN64-NEXT: lh a2, 710(sp)
+; ZVFHMIN64-NEXT: lh a4, 454(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa3, s10
+; ZVFHMIN64-NEXT: feq.h a5, fa1, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, ra
+; ZVFHMIN64-NEXT: sb a2, 163(sp)
+; ZVFHMIN64-NEXT: lh a2, 708(sp)
+; ZVFHMIN64-NEXT: lh a4, 452(sp)
+; ZVFHMIN64-NEXT: feq.h s4, fa0, fa3
+; ZVFHMIN64-NEXT: feq.h s5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: sb a2, 162(sp)
; ZVFHMIN64-NEXT: lh a2, 706(sp)
; ZVFHMIN64-NEXT: lh a4, 450(sp)
-; ZVFHMIN64-NEXT: sb a1, 129(sp)
-; ZVFHMIN64-NEXT: feq.h a1, fa1, fa5
-; ZVFHMIN64-NEXT: sb a3, 130(sp)
-; ZVFHMIN64-NEXT: feq.h a3, fa2, ft4
-; ZVFHMIN64-NEXT: sb a1, 131(sp)
-; ZVFHMIN64-NEXT: feq.h a1, fa4, ft2
-; ZVFHMIN64-NEXT: sb a3, 132(sp)
-; ZVFHMIN64-NEXT: feq.h a3, fa3, ft3
+; ZVFHMIN64-NEXT: sb s5, 129(sp)
+; ZVFHMIN64-NEXT: sb s4, 130(sp)
+; ZVFHMIN64-NEXT: sb a5, 131(sp)
+; ZVFHMIN64-NEXT: sb s3, 132(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: sb a3, 133(sp)
-; ZVFHMIN64-NEXT: sb a1, 134(sp)
+; ZVFHMIN64-NEXT: sb a1, 133(sp)
+; ZVFHMIN64-NEXT: sb a3, 134(sp)
; ZVFHMIN64-NEXT: sb a0, 135(sp)
; ZVFHMIN64-NEXT: sb a2, 161(sp)
; ZVFHMIN64-NEXT: lh a0, 610(sp)
; ZVFHMIN64-NEXT: lh a1, 354(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s4, v23
+; ZVFHMIN64-NEXT: vmv.x.s s6, v23
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 10
+; ZVFHMIN64-NEXT: li a3, 18
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2882,12 +2889,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 608(sp)
; ZVFHMIN64-NEXT: lh a1, 352(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 4
+; ZVFHMIN64-NEXT: li a3, 22
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a3, a2, 4
-; ZVFHMIN64-NEXT: sub a2, a3, a2
+; ZVFHMIN64-NEXT: li a3, 21
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
@@ -2896,148 +2904,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 240(sp)
; ZVFHMIN64-NEXT: lh a0, 606(sp)
; ZVFHMIN64-NEXT: lh a1, 350(sp)
-; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7
-; ZVFHMIN64-NEXT: vmv.x.s s6, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2
; ZVFHMIN64-NEXT: sb a0, 239(sp)
; ZVFHMIN64-NEXT: lh a0, 604(sp)
; ZVFHMIN64-NEXT: lh a1, 348(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6
-; ZVFHMIN64-NEXT: vmv.x.s s7, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 238(sp)
; ZVFHMIN64-NEXT: lh a0, 602(sp)
; ZVFHMIN64-NEXT: lh a1, 346(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5
-; ZVFHMIN64-NEXT: vmv.x.s s8, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 237(sp)
; ZVFHMIN64-NEXT: lh a0, 600(sp)
; ZVFHMIN64-NEXT: lh a1, 344(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4
-; ZVFHMIN64-NEXT: vmv.x.s s9, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: vmv.x.s a3, v8
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 236(sp)
; ZVFHMIN64-NEXT: lh a0, 598(sp)
; ZVFHMIN64-NEXT: lh a1, 342(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3
-; ZVFHMIN64-NEXT: vmv.x.s s10, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: vmv.x.s a4, v8
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 235(sp)
; ZVFHMIN64-NEXT: lh a0, 596(sp)
; ZVFHMIN64-NEXT: lh a1, 340(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2
-; ZVFHMIN64-NEXT: vmv.x.s s11, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: vmv.x.s a5, v8
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 234(sp)
; ZVFHMIN64-NEXT: lh a0, 594(sp)
; ZVFHMIN64-NEXT: lh a1, 338(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1
-; ZVFHMIN64-NEXT: vmv.x.s ra, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: vmv.x.s t6, v8
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 233(sp)
; ZVFHMIN64-NEXT: lh a0, 592(sp)
; ZVFHMIN64-NEXT: lh a1, 336(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: vmv.x.s s2, v8
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 232(sp)
; ZVFHMIN64-NEXT: lh a0, 590(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a2
; ZVFHMIN64-NEXT: lh a1, 334(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa3, t5
-; ZVFHMIN64-NEXT: fmv.h.x fa2, s4
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: fmv.h.x fa1, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa0, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0
+; ZVFHMIN64-NEXT: feq.h t5, fa3, fa2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa1, fa3
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a3
; ZVFHMIN64-NEXT: sb a0, 231(sp)
; ZVFHMIN64-NEXT: lh a0, 588(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a4
; ZVFHMIN64-NEXT: lh a1, 332(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa1, s2
-; ZVFHMIN64-NEXT: fmv.h.x fa0, s5
-; ZVFHMIN64-NEXT: fmv.h.x ft0, a0
-; ZVFHMIN64-NEXT: fmv.h.x ft1, a1
-; ZVFHMIN64-NEXT: feq.h a0, ft0, ft1
-; ZVFHMIN64-NEXT: sb a0, 230(sp)
-; ZVFHMIN64-NEXT: lh a0, 586(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft0, s3
-; ZVFHMIN64-NEXT: lh a1, 330(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft1, s6
-; ZVFHMIN64-NEXT: fmv.h.x ft2, a0
-; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1
+; ZVFHMIN64-NEXT: feq.h a3, fa5, fa3
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s6
+; ZVFHMIN64-NEXT: sb a1, 230(sp)
+; ZVFHMIN64-NEXT: lh a1, 586(sp)
+; ZVFHMIN64-NEXT: lh a4, 330(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h a1, ft2, fa5
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s7
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s5
; ZVFHMIN64-NEXT: sb a1, 229(sp)
; ZVFHMIN64-NEXT: lh a1, 584(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft1, s8
-; ZVFHMIN64-NEXT: lh a2, 328(sp)
-; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: lh a4, 328(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa3, ft1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT: sb a2, 228(sp)
-; ZVFHMIN64-NEXT: lh a2, 582(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s4
+; ZVFHMIN64-NEXT: sb a1, 228(sp)
+; ZVFHMIN64-NEXT: lh a1, 582(sp)
; ZVFHMIN64-NEXT: lh a4, 326(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT: feq.h t4, fa2, fa5
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s11
-; ZVFHMIN64-NEXT: fmv.h.x fa3, ra
-; ZVFHMIN64-NEXT: sb a2, 227(sp)
-; ZVFHMIN64-NEXT: lh a2, 580(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s3
+; ZVFHMIN64-NEXT: sb a1, 227(sp)
+; ZVFHMIN64-NEXT: lh a1, 580(sp)
; ZVFHMIN64-NEXT: lh a4, 324(sp)
-; ZVFHMIN64-NEXT: feq.h t5, fa0, fa5
-; ZVFHMIN64-NEXT: feq.h t6, ft0, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3
-; ZVFHMIN64-NEXT: sb a2, 226(sp)
-; ZVFHMIN64-NEXT: lh a2, 578(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 226(sp)
+; ZVFHMIN64-NEXT: lh a1, 578(sp)
; ZVFHMIN64-NEXT: lh a4, 322(sp)
-; ZVFHMIN64-NEXT: sb t6, 193(sp)
-; ZVFHMIN64-NEXT: feq.h t6, fa1, fa4
-; ZVFHMIN64-NEXT: sb t5, 194(sp)
+; ZVFHMIN64-NEXT: sb a2, 193(sp)
+; ZVFHMIN64-NEXT: sb s2, 194(sp)
; ZVFHMIN64-NEXT: sb t6, 195(sp)
-; ZVFHMIN64-NEXT: sb t4, 196(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: sb a5, 196(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 197(sp)
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 197(sp)
; ZVFHMIN64-NEXT: sb a3, 198(sp)
-; ZVFHMIN64-NEXT: sb a0, 199(sp)
-; ZVFHMIN64-NEXT: sb a2, 225(sp)
+; ZVFHMIN64-NEXT: sb t5, 199(sp)
+; ZVFHMIN64-NEXT: sb a1, 225(sp)
; ZVFHMIN64-NEXT: lh a0, 766(sp)
; ZVFHMIN64-NEXT: lh a1, 510(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a3, a2, 4
-; ZVFHMIN64-NEXT: add a2, a3, a2
+; ZVFHMIN64-NEXT: li a3, 19
+; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s s2, v8
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 11
+; ZVFHMIN64-NEXT: li a3, 14
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
@@ -3049,305 +3057,301 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 191(sp)
; ZVFHMIN64-NEXT: lh a0, 764(sp)
; ZVFHMIN64-NEXT: lh a1, 508(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t5, v4
-; ZVFHMIN64-NEXT: vmv.x.s t4, v30
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 190(sp)
-; ZVFHMIN64-NEXT: lh a0, 762(sp)
-; ZVFHMIN64-NEXT: lh a1, 506(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t5, v6
; ZVFHMIN64-NEXT: csrr a2, vlenb
; ZVFHMIN64-NEXT: slli a2, a2, 2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a2, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 190(sp)
+; ZVFHMIN64-NEXT: lh a0, 762(sp)
+; ZVFHMIN64-NEXT: lh a1, 506(sp)
; ZVFHMIN64-NEXT: csrr a3, vlenb
-; ZVFHMIN64-NEXT: slli a3, a3, 1
+; ZVFHMIN64-NEXT: slli a3, a3, 3
; ZVFHMIN64-NEXT: add a3, sp, a3
; ZVFHMIN64-NEXT: addi a3, a3, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a3, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 189(sp)
-; ZVFHMIN64-NEXT: lh a0, 760(sp)
-; ZVFHMIN64-NEXT: lh a1, 504(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t3
; ZVFHMIN64-NEXT: csrr a4, vlenb
-; ZVFHMIN64-NEXT: li t3, 6
-; ZVFHMIN64-NEXT: mul a4, a4, t3
+; ZVFHMIN64-NEXT: li a5, 6
+; ZVFHMIN64-NEXT: mul a4, a4, a5
; ZVFHMIN64-NEXT: add a4, sp, a4
; ZVFHMIN64-NEXT: addi a4, a4, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a4, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
-; ZVFHMIN64-NEXT: sb a0, 188(sp)
-; ZVFHMIN64-NEXT: lh a0, 758(sp)
-; ZVFHMIN64-NEXT: lh a1, 502(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT: csrr t2, vlenb
-; ZVFHMIN64-NEXT: slli t2, t2, 3
-; ZVFHMIN64-NEXT: add t2, sp, t2
-; ZVFHMIN64-NEXT: addi t2, t2, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t2, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT: sb a0, 187(sp)
-; ZVFHMIN64-NEXT: lh a0, 756(sp)
-; ZVFHMIN64-NEXT: lh a1, 500(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa3, t1
-; ZVFHMIN64-NEXT: csrr t1, vlenb
-; ZVFHMIN64-NEXT: li t3, 13
-; ZVFHMIN64-NEXT: mul t1, t1, t3
-; ZVFHMIN64-NEXT: add t1, sp, t1
-; ZVFHMIN64-NEXT: addi t1, t1, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t3, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
-; ZVFHMIN64-NEXT: sb a0, 186(sp)
-; ZVFHMIN64-NEXT: lh a0, 754(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa2, t0
-; ZVFHMIN64-NEXT: lh a1, 498(sp)
-; ZVFHMIN64-NEXT: csrr t0, vlenb
-; ZVFHMIN64-NEXT: li t1, 19
-; ZVFHMIN64-NEXT: mul t0, t0, t1
-; ZVFHMIN64-NEXT: add t0, sp, t0
-; ZVFHMIN64-NEXT: addi t0, t0, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s s3, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 189(sp)
+; ZVFHMIN64-NEXT: lh a1, 760(sp)
+; ZVFHMIN64-NEXT: lh a5, 504(sp)
+; ZVFHMIN64-NEXT: csrr a0, vlenb
+; ZVFHMIN64-NEXT: li s3, 12
+; ZVFHMIN64-NEXT: mul a0, a0, s3
+; ZVFHMIN64-NEXT: add a0, sp, a0
+; ZVFHMIN64-NEXT: addi a0, a0, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s s5, v8
; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li t0, 21
-; ZVFHMIN64-NEXT: mul a0, a0, t0
+; ZVFHMIN64-NEXT: li s3, 10
+; ZVFHMIN64-NEXT: mul a0, a0, s3
; ZVFHMIN64-NEXT: add a0, sp, a0
; ZVFHMIN64-NEXT: addi a0, a0, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a0, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa0, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa1, fa0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 188(sp)
+; ZVFHMIN64-NEXT: lh a1, 758(sp)
+; ZVFHMIN64-NEXT: lh a5, 502(sp)
+; ZVFHMIN64-NEXT: csrr s3, vlenb
+; ZVFHMIN64-NEXT: slli s3, s3, 4
+; ZVFHMIN64-NEXT: add s3, sp, s3
+; ZVFHMIN64-NEXT: addi s3, s3, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s s4, v8
+; ZVFHMIN64-NEXT: vmv.x.s s3, v16
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT: sb a1, 187(sp)
+; ZVFHMIN64-NEXT: lh a1, 756(sp)
+; ZVFHMIN64-NEXT: lh a5, 500(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT: sb a1, 186(sp)
+; ZVFHMIN64-NEXT: lh a1, 754(sp)
+; ZVFHMIN64-NEXT: lh a2, 498(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t1
; ZVFHMIN64-NEXT: sb a1, 185(sp)
; ZVFHMIN64-NEXT: lh a1, 752(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa0, a3
; ZVFHMIN64-NEXT: lh a2, 496(sp)
-; ZVFHMIN64-NEXT: feq.h t0, fa5, fa1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h t1, fa4, fa0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
; ZVFHMIN64-NEXT: sb a1, 184(sp)
; ZVFHMIN64-NEXT: lh a1, 750(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
; ZVFHMIN64-NEXT: lh a2, 494(sp)
-; ZVFHMIN64-NEXT: feq.h a3, fa3, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s5
+; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa2, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT: sb a1, 183(sp)
+; ZVFHMIN64-NEXT: lh a1, 748(sp)
+; ZVFHMIN64-NEXT: lh a2, 492(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT: sb a2, 183(sp)
-; ZVFHMIN64-NEXT: lh a2, 748(sp)
-; ZVFHMIN64-NEXT: lh a4, 492(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 182(sp)
+; ZVFHMIN64-NEXT: lh a1, 746(sp)
+; ZVFHMIN64-NEXT: lh a2, 490(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s4
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT: sb a2, 182(sp)
-; ZVFHMIN64-NEXT: lh a2, 746(sp)
-; ZVFHMIN64-NEXT: lh a4, 490(sp)
+; ZVFHMIN64-NEXT: sb a1, 181(sp)
+; ZVFHMIN64-NEXT: lh a1, 744(sp)
+; ZVFHMIN64-NEXT: lh a2, 488(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, s3
; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: ld a2, 80(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT: sb a2, 181(sp)
-; ZVFHMIN64-NEXT: lh a2, 744(sp)
-; ZVFHMIN64-NEXT: lh a4, 488(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: ld a4, 88(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT: vmv.x.s a5, v0
+; ZVFHMIN64-NEXT: addi a2, sp, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT: vmv.x.s a4, v8
-; ZVFHMIN64-NEXT: sb a2, 180(sp)
-; ZVFHMIN64-NEXT: lh a2, 742(sp)
-; ZVFHMIN64-NEXT: lh t2, 486(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: vmv.x.s a5, v8
+; ZVFHMIN64-NEXT: sb a1, 180(sp)
+; ZVFHMIN64-NEXT: lh a1, 742(sp)
+; ZVFHMIN64-NEXT: lh a7, 486(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: sb a2, 179(sp)
-; ZVFHMIN64-NEXT: lh a2, 740(sp)
-; ZVFHMIN64-NEXT: lh t2, 484(sp)
-; ZVFHMIN64-NEXT: sb a1, 140(sp)
-; ZVFHMIN64-NEXT: sb a3, 141(sp)
-; ZVFHMIN64-NEXT: sb t1, 142(sp)
-; ZVFHMIN64-NEXT: sb t0, 143(sp)
-; ZVFHMIN64-NEXT: sb a5, 136(sp)
-; ZVFHMIN64-NEXT: sb a0, 137(sp)
-; ZVFHMIN64-NEXT: sb a6, 138(sp)
-; ZVFHMIN64-NEXT: sb a7, 139(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 179(sp)
+; ZVFHMIN64-NEXT: lh a1, 740(sp)
+; ZVFHMIN64-NEXT: lh a7, 484(sp)
+; ZVFHMIN64-NEXT: sb a3, 140(sp)
+; ZVFHMIN64-NEXT: sb t1, 141(sp)
+; ZVFHMIN64-NEXT: sb t3, 142(sp)
+; ZVFHMIN64-NEXT: sb t4, 143(sp)
+; ZVFHMIN64-NEXT: sb a2, 136(sp)
+; ZVFHMIN64-NEXT: sb a6, 137(sp)
+; ZVFHMIN64-NEXT: sb a4, 138(sp)
+; ZVFHMIN64-NEXT: sb a0, 139(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 178(sp)
-; ZVFHMIN64-NEXT: lh a0, 638(sp)
-; ZVFHMIN64-NEXT: lh a1, 382(sp)
+; ZVFHMIN64-NEXT: lh a1, 638(sp)
+; ZVFHMIN64-NEXT: lh a2, 382(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14
-; ZVFHMIN64-NEXT: vmv.x.s t2, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 255(sp)
-; ZVFHMIN64-NEXT: lh a0, 636(sp)
-; ZVFHMIN64-NEXT: lh a1, 380(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a0, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 255(sp)
+; ZVFHMIN64-NEXT: lh a1, 636(sp)
+; ZVFHMIN64-NEXT: lh a2, 380(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13
-; ZVFHMIN64-NEXT: vmv.x.s t1, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 254(sp)
-; ZVFHMIN64-NEXT: lh a0, 634(sp)
-; ZVFHMIN64-NEXT: lh a1, 378(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t2, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 254(sp)
+; ZVFHMIN64-NEXT: lh a1, 634(sp)
+; ZVFHMIN64-NEXT: lh a2, 378(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12
-; ZVFHMIN64-NEXT: vmv.x.s t0, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 253(sp)
-; ZVFHMIN64-NEXT: lh a0, 632(sp)
-; ZVFHMIN64-NEXT: lh a1, 376(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t1, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 253(sp)
+; ZVFHMIN64-NEXT: lh a1, 632(sp)
+; ZVFHMIN64-NEXT: lh a2, 376(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11
-; ZVFHMIN64-NEXT: vmv.x.s a7, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 252(sp)
-; ZVFHMIN64-NEXT: lh a0, 630(sp)
-; ZVFHMIN64-NEXT: lh a1, 374(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t0, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 252(sp)
+; ZVFHMIN64-NEXT: lh a1, 630(sp)
+; ZVFHMIN64-NEXT: lh a2, 374(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10
-; ZVFHMIN64-NEXT: vmv.x.s a6, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 251(sp)
-; ZVFHMIN64-NEXT: lh a0, 628(sp)
-; ZVFHMIN64-NEXT: lh a1, 372(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a7, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 251(sp)
+; ZVFHMIN64-NEXT: lh a1, 628(sp)
+; ZVFHMIN64-NEXT: lh a2, 372(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9
-; ZVFHMIN64-NEXT: vmv.x.s a5, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: vmv.x.s a6, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: ld a2, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: sb a1, 250(sp)
+; ZVFHMIN64-NEXT: lh a1, 626(sp)
+; ZVFHMIN64-NEXT: lh a2, 370(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: ld a2, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: sb a1, 249(sp)
+; ZVFHMIN64-NEXT: lh a1, 624(sp)
+; ZVFHMIN64-NEXT: lh a2, 368(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 250(sp)
-; ZVFHMIN64-NEXT: lh a0, 626(sp)
-; ZVFHMIN64-NEXT: lh a1, 370(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: sb a0, 248(sp)
+; ZVFHMIN64-NEXT: lh a0, 622(sp)
+; ZVFHMIN64-NEXT: lh a1, 366(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 249(sp)
-; ZVFHMIN64-NEXT: lh a1, 624(sp)
-; ZVFHMIN64-NEXT: lh a3, 368(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: ld a3, 96(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT: sb a1, 248(sp)
-; ZVFHMIN64-NEXT: lh a1, 622(sp)
-; ZVFHMIN64-NEXT: lh a3, 366(sp)
+; ZVFHMIN64-NEXT: sb a0, 247(sp)
+; ZVFHMIN64-NEXT: lh a0, 620(sp)
+; ZVFHMIN64-NEXT: lh a1, 364(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: ld a3, 112(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT: sb a1, 247(sp)
-; ZVFHMIN64-NEXT: lh a1, 620(sp)
-; ZVFHMIN64-NEXT: lh a3, 364(sp)
+; ZVFHMIN64-NEXT: sb a0, 246(sp)
+; ZVFHMIN64-NEXT: lh a0, 618(sp)
+; ZVFHMIN64-NEXT: lh a1, 362(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, t0
; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT: sb a1, 246(sp)
-; ZVFHMIN64-NEXT: lh a1, 618(sp)
-; ZVFHMIN64-NEXT: lh a3, 362(sp)
+; ZVFHMIN64-NEXT: sb a0, 245(sp)
+; ZVFHMIN64-NEXT: lh a0, 616(sp)
+; ZVFHMIN64-NEXT: lh a1, 360(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT: sb a1, 245(sp)
-; ZVFHMIN64-NEXT: lh a1, 616(sp)
-; ZVFHMIN64-NEXT: lh a3, 360(sp)
+; ZVFHMIN64-NEXT: sb a0, 244(sp)
+; ZVFHMIN64-NEXT: lh a0, 614(sp)
+; ZVFHMIN64-NEXT: lh a1, 358(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, a6
; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT: sb a1, 244(sp)
-; ZVFHMIN64-NEXT: lh a1, 614(sp)
-; ZVFHMIN64-NEXT: lh a3, 358(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8
-; ZVFHMIN64-NEXT: vmv.x.s a3, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: sb a1, 243(sp)
-; ZVFHMIN64-NEXT: lh a1, 612(sp)
-; ZVFHMIN64-NEXT: lh a3, 356(sp)
-; ZVFHMIN64-NEXT: sb t0, 204(sp)
-; ZVFHMIN64-NEXT: sb a4, 205(sp)
-; ZVFHMIN64-NEXT: sb a0, 206(sp)
-; ZVFHMIN64-NEXT: sb a2, 207(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a1, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: sb a0, 243(sp)
+; ZVFHMIN64-NEXT: lh a0, 612(sp)
+; ZVFHMIN64-NEXT: lh a1, 356(sp)
+; ZVFHMIN64-NEXT: sb a5, 204(sp)
+; ZVFHMIN64-NEXT: sb a2, 205(sp)
+; ZVFHMIN64-NEXT: sb a3, 206(sp)
+; ZVFHMIN64-NEXT: sb a4, 207(sp)
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a2, 200(sp)
+; ZVFHMIN64-NEXT: sb a6, 201(sp)
+; ZVFHMIN64-NEXT: sb a7, 202(sp)
+; ZVFHMIN64-NEXT: sb t0, 203(sp)
+; ZVFHMIN64-NEXT: li a2, 128
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 200(sp)
-; ZVFHMIN64-NEXT: sb a5, 201(sp)
-; ZVFHMIN64-NEXT: sb a6, 202(sp)
-; ZVFHMIN64-NEXT: sb a7, 203(sp)
-; ZVFHMIN64-NEXT: li a0, 128
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 242(sp)
-; ZVFHMIN64-NEXT: addi a1, sp, 128
-; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN64-NEXT: vle8.v v8, (a1)
+; ZVFHMIN64-NEXT: sb a0, 242(sp)
+; ZVFHMIN64-NEXT: addi a0, sp, 128
+; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN64-NEXT: vle8.v v8, (a0)
; ZVFHMIN64-NEXT: vand.vi v8, v8, 1
; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN64-NEXT: addi sp, s0, -896
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index dd2a8240ee2533..5b272c98a1e0ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -507,26 +507,34 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
; RV32-LABEL: match_nxv16i8_v32i8:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
-; RV32-NEXT: .cfi_offset s2, -12
-; RV32-NEXT: .cfi_offset s3, -16
-; RV32-NEXT: .cfi_offset s4, -20
-; RV32-NEXT: .cfi_offset s5, -24
-; RV32-NEXT: .cfi_offset s6, -28
-; RV32-NEXT: .cfi_offset s7, -32
-; RV32-NEXT: .cfi_offset s8, -36
+; RV32-NEXT: addi sp, sp, -64
+; RV32-NEXT: .cfi_def_cfa_offset 64
+; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: .cfi_offset s1, -12
+; RV32-NEXT: .cfi_offset s2, -16
+; RV32-NEXT: .cfi_offset s3, -20
+; RV32-NEXT: .cfi_offset s4, -24
+; RV32-NEXT: .cfi_offset s5, -28
+; RV32-NEXT: .cfi_offset s6, -32
+; RV32-NEXT: .cfi_offset s7, -36
+; RV32-NEXT: .cfi_offset s8, -40
+; RV32-NEXT: .cfi_offset s9, -44
+; RV32-NEXT: .cfi_offset s10, -48
+; RV32-NEXT: .cfi_offset s11, -52
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v10
; RV32-NEXT: vslidedown.vi v12, v10, 1
@@ -584,43 +592,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: vmv.x.s s5, v15
; RV32-NEXT: vmv.x.s s6, v16
; RV32-NEXT: vmv.x.s s7, v17
-; RV32-NEXT: vsetvli s8, zero, e8, m2, ta, ma
+; RV32-NEXT: vmv.x.s s8, v18
+; RV32-NEXT: vmv.x.s s9, v19
+; RV32-NEXT: vmv.x.s s10, v20
+; RV32-NEXT: vmv.x.s s11, v21
+; RV32-NEXT: vsetvli ra, zero, e8, m2, ta, ma
; RV32-NEXT: vmseq.vx v12, v8, a0
-; RV32-NEXT: vmv.x.s a0, v18
+; RV32-NEXT: vmv.x.s a0, v22
; RV32-NEXT: vmseq.vx v13, v8, s2
-; RV32-NEXT: vmv.x.s s2, v19
+; RV32-NEXT: vmv.x.s s2, v23
; RV32-NEXT: vmseq.vx v14, v8, s3
-; RV32-NEXT: vmv.x.s s3, v20
-; RV32-NEXT: vmseq.vx v15, v8, s4
-; RV32-NEXT: vmv.x.s s4, v21
-; RV32-NEXT: vmseq.vx v16, v8, s5
-; RV32-NEXT: vmv.x.s s5, v22
-; RV32-NEXT: vmseq.vx v17, v8, s6
-; RV32-NEXT: vmv.x.s s6, v23
-; RV32-NEXT: vmseq.vx v18, v8, s7
-; RV32-NEXT: vmv.x.s s7, v11
-; RV32-NEXT: vmseq.vx v11, v8, a0
-; RV32-NEXT: vmv.x.s a0, v24
-; RV32-NEXT: vmseq.vx v19, v8, s2
-; RV32-NEXT: vmv.x.s s2, v10
+; RV32-NEXT: vmv.x.s s3, v11
+; RV32-NEXT: vmseq.vx v11, v8, s4
+; RV32-NEXT: vmv.x.s s4, v24
+; RV32-NEXT: vmseq.vx v15, v8, s5
+; RV32-NEXT: vmv.x.s s5, v10
; RV32-NEXT: vmor.mm v10, v12, v13
+; RV32-NEXT: vmseq.vx v12, v8, s6
; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v13, v8, s7
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vx v11, v8, s8
; RV32-NEXT: vmor.mm v10, v10, v15
-; RV32-NEXT: vmor.mm v10, v10, v16
-; RV32-NEXT: vmor.mm v10, v10, v17
-; RV32-NEXT: vmseq.vx v12, v8, s3
-; RV32-NEXT: vmor.mm v10, v10, v18
-; RV32-NEXT: vmseq.vx v13, v8, s4
+; RV32-NEXT: vmseq.vx v14, v8, s9
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, s10
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, s11
; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, s5
-; RV32-NEXT: vmor.mm v10, v10, v19
-; RV32-NEXT: vmseq.vx v14, v8, s6
+; RV32-NEXT: vmseq.vx v11, v8, a0
+; RV32-NEXT: vmor.mm v10, v10, v14
+; RV32-NEXT: vmseq.vx v14, v8, s2
; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s7
+; RV32-NEXT: vmseq.vx v12, v8, s3
; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, a0
+; RV32-NEXT: vmseq.vx v13, v8, s4
; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, s2
+; RV32-NEXT: vmseq.vx v11, v8, s5
; RV32-NEXT: vmor.mm v10, v10, v14
; RV32-NEXT: vmseq.vx v14, v8, a1
; RV32-NEXT: vmor.mm v10, v10, v12
@@ -658,15 +666,20 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: vmseq.vx v11, v8, s1
; RV32-NEXT: vmor.mm v8, v10, v11
; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
@@ -676,32 +689,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: .cfi_restore s6
; RV32-NEXT: .cfi_restore s7
; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_restore s9
+; RV32-NEXT: .cfi_restore s10
+; RV32-NEXT: .cfi_restore s11
+; RV32-NEXT: addi sp, sp, 64
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: match_nxv16i8_v32i8:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -80
-; RV64-NEXT: .cfi_def_cfa_offset 80
-; RV64-NEXT: sd s0, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s8, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset s0, -8
-; RV64-NEXT: .cfi_offset s1, -16
-; RV64-NEXT: .cfi_offset s2, -24
-; RV64-NEXT: .cfi_offset s3, -32
-; RV64-NEXT: .cfi_offset s4, -40
-; RV64-NEXT: .cfi_offset s5, -48
-; RV64-NEXT: .cfi_offset s6, -56
-; RV64-NEXT: .cfi_offset s7, -64
-; RV64-NEXT: .cfi_offset s8, -72
+; RV64-NEXT: addi sp, sp, -112
+; RV64-NEXT: .cfi_def_cfa_offset 112
+; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: .cfi_offset s1, -24
+; RV64-NEXT: .cfi_offset s2, -32
+; RV64-NEXT: .cfi_offset s3, -40
+; RV64-NEXT: .cfi_offset s4, -48
+; RV64-NEXT: .cfi_offset s5, -56
+; RV64-NEXT: .cfi_offset s6, -64
+; RV64-NEXT: .cfi_offset s7, -72
+; RV64-NEXT: .cfi_offset s8, -80
+; RV64-NEXT: .cfi_offset s9, -88
+; RV64-NEXT: .cfi_offset s10, -96
+; RV64-NEXT: .cfi_offset s11, -104
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v10
; RV64-NEXT: vslidedown.vi v12, v10, 1
@@ -759,43 +783,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: vmv.x.s s5, v15
; RV64-NEXT: vmv.x.s s6, v16
; RV64-NEXT: vmv.x.s s7, v17
-; RV64-NEXT: vsetvli s8, zero, e8, m2, ta, ma
+; RV64-NEXT: vmv.x.s s8, v18
+; RV64-NEXT: vmv.x.s s9, v19
+; RV64-NEXT: vmv.x.s s10, v20
+; RV64-NEXT: vmv.x.s s11, v21
+; RV64-NEXT: vsetvli ra, zero, e8, m2, ta, ma
; RV64-NEXT: vmseq.vx v12, v8, a0
-; RV64-NEXT: vmv.x.s a0, v18
+; RV64-NEXT: vmv.x.s a0, v22
; RV64-NEXT: vmseq.vx v13, v8, s2
-; RV64-NEXT: vmv.x.s s2, v19
+; RV64-NEXT: vmv.x.s s2, v23
; RV64-NEXT: vmseq.vx v14, v8, s3
-; RV64-NEXT: vmv.x.s s3, v20
-; RV64-NEXT: vmseq.vx v15, v8, s4
-; RV64-NEXT: vmv.x.s s4, v21
-; RV64-NEXT: vmseq.vx v16, v8, s5
-; RV64-NEXT: vmv.x.s s5, v22
-; RV64-NEXT: vmseq.vx v17, v8, s6
-; RV64-NEXT: vmv.x.s s6, v23
-; RV64-NEXT: vmseq.vx v18, v8, s7
-; RV64-NEXT: vmv.x.s s7, v11
-; RV64-NEXT: vmseq.vx v11, v8, a0
-; RV64-NEXT: vmv.x.s a0, v24
-; RV64-NEXT: vmseq.vx v19, v8, s2
-; RV64-NEXT: vmv.x.s s2, v10
+; RV64-NEXT: vmv.x.s s3, v11
+; RV64-NEXT: vmseq.vx v11, v8, s4
+; RV64-NEXT: vmv.x.s s4, v24
+; RV64-NEXT: vmseq.vx v15, v8, s5
+; RV64-NEXT: vmv.x.s s5, v10
; RV64-NEXT: vmor.mm v10, v12, v13
+; RV64-NEXT: vmseq.vx v12, v8, s6
; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v13, v8, s7
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vx v11, v8, s8
; RV64-NEXT: vmor.mm v10, v10, v15
-; RV64-NEXT: vmor.mm v10, v10, v16
-; RV64-NEXT: vmor.mm v10, v10, v17
-; RV64-NEXT: vmseq.vx v12, v8, s3
-; RV64-NEXT: vmor.mm v10, v10, v18
-; RV64-NEXT: vmseq.vx v13, v8, s4
+; RV64-NEXT: vmseq.vx v14, v8, s9
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, s10
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, s11
; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, s5
-; RV64-NEXT: vmor.mm v10, v10, v19
-; RV64-NEXT: vmseq.vx v14, v8, s6
+; RV64-NEXT: vmseq.vx v11, v8, a0
+; RV64-NEXT: vmor.mm v10, v10, v14
+; RV64-NEXT: vmseq.vx v14, v8, s2
; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s7
+; RV64-NEXT: vmseq.vx v12, v8, s3
; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, a0
+; RV64-NEXT: vmseq.vx v13, v8, s4
; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, s2
+; RV64-NEXT: vmseq.vx v11, v8, s5
; RV64-NEXT: vmor.mm v10, v10, v14
; RV64-NEXT: vmseq.vx v14, v8, a1
; RV64-NEXT: vmor.mm v10, v10, v12
@@ -833,15 +857,20 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: vmseq.vx v11, v8, s1
; RV64-NEXT: vmor.mm v8, v10, v11
; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld s0, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s8, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: .cfi_restore s2
@@ -851,7 +880,10 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: .cfi_restore s6
; RV64-NEXT: .cfi_restore s7
; RV64-NEXT: .cfi_restore s8
-; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: .cfi_restore s9
+; RV64-NEXT: .cfi_restore s10
+; RV64-NEXT: .cfi_restore s11
+; RV64-NEXT: addi sp, sp, 112
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
@@ -861,16 +893,20 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
; RV32-LABEL: match_v16i8_v32i8:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: .cfi_def_cfa_offset 32
-; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 0(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: .cfi_offset s2, -12
@@ -879,6 +915,10 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: .cfi_offset s5, -24
; RV32-NEXT: .cfi_offset s6, -28
; RV32-NEXT: .cfi_offset s7, -32
+; RV32-NEXT: .cfi_offset s8, -36
+; RV32-NEXT: .cfi_offset s9, -40
+; RV32-NEXT: .cfi_offset s10, -44
+; RV32-NEXT: .cfi_offset s11, -48
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v10
; RV32-NEXT: vslidedown.vi v9, v10, 1
@@ -936,42 +976,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: vmv.x.s s5, v14
; RV32-NEXT: vmv.x.s s6, v15
; RV32-NEXT: vmv.x.s s7, v16
+; RV32-NEXT: vmv.x.s s8, v17
+; RV32-NEXT: vmv.x.s s9, v18
+; RV32-NEXT: vmv.x.s s10, v19
+; RV32-NEXT: vmv.x.s s11, v20
; RV32-NEXT: vmseq.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a0, v17
+; RV32-NEXT: vmv.x.s a0, v21
; RV32-NEXT: vmseq.vx v12, v8, s2
-; RV32-NEXT: vmv.x.s s2, v18
+; RV32-NEXT: vmv.x.s s2, v22
; RV32-NEXT: vmseq.vx v13, v8, s3
-; RV32-NEXT: vmv.x.s s3, v19
-; RV32-NEXT: vmseq.vx v14, v8, s4
-; RV32-NEXT: vmv.x.s s4, v20
-; RV32-NEXT: vmseq.vx v15, v8, s5
-; RV32-NEXT: vmv.x.s s5, v21
-; RV32-NEXT: vmseq.vx v16, v8, s6
-; RV32-NEXT: vmv.x.s s6, v22
-; RV32-NEXT: vmseq.vx v17, v8, s7
-; RV32-NEXT: vmv.x.s s7, v11
-; RV32-NEXT: vmseq.vx v11, v8, a0
-; RV32-NEXT: vmv.x.s a0, v23
-; RV32-NEXT: vmseq.vx v18, v8, s2
-; RV32-NEXT: vmv.x.s s2, v10
+; RV32-NEXT: vmv.x.s s3, v11
+; RV32-NEXT: vmseq.vx v11, v8, s4
+; RV32-NEXT: vmv.x.s s4, v23
+; RV32-NEXT: vmseq.vx v14, v8, s5
+; RV32-NEXT: vmv.x.s s5, v10
; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v10, v8, s6
; RV32-NEXT: vmor.mm v9, v9, v13
+; RV32-NEXT: vmseq.vx v12, v8, s7
+; RV32-NEXT: vmor.mm v9, v9, v11
+; RV32-NEXT: vmseq.vx v11, v8, s8
; RV32-NEXT: vmor.mm v9, v9, v14
-; RV32-NEXT: vmor.mm v9, v9, v15
-; RV32-NEXT: vmor.mm v9, v9, v16
-; RV32-NEXT: vmseq.vx v10, v8, s3
-; RV32-NEXT: vmor.mm v9, v9, v17
-; RV32-NEXT: vmseq.vx v12, v8, s4
+; RV32-NEXT: vmseq.vx v13, v8, s9
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vx v10, v8, s10
+; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v12, v8, s11
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s5
-; RV32-NEXT: vmor.mm v9, v9, v18
-; RV32-NEXT: vmseq.vx v13, v8, s6
+; RV32-NEXT: vmseq.vx v11, v8, a0
+; RV32-NEXT: vmor.mm v9, v9, v13
+; RV32-NEXT: vmseq.vx v13, v8, s2
; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, s7
+; RV32-NEXT: vmseq.vx v10, v8, s3
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, a0
+; RV32-NEXT: vmseq.vx v12, v8, s4
; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s2
+; RV32-NEXT: vmseq.vx v11, v8, s5
; RV32-NEXT: vmor.mm v9, v9, v13
; RV32-NEXT: vmseq.vx v13, v8, a1
; RV32-NEXT: vmor.mm v9, v9, v10
@@ -1009,14 +1049,18 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: vmseq.vx v8, v8, s1
; RV32-NEXT: vmor.mm v8, v9, v8
; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 0(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
@@ -1025,22 +1069,30 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: .cfi_restore s5
; RV32-NEXT: .cfi_restore s6
; RV32-NEXT: .cfi_restore s7
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_restore s8
+; RV32-NEXT: .cfi_restore s9
+; RV32-NEXT: .cfi_restore s10
+; RV32-NEXT: .cfi_restore s11
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: match_v16i8_v32i8:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
-; RV64-NEXT: sd s0, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi sp, sp, -96
+; RV64-NEXT: .cfi_def_cfa_offset 96
+; RV64-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset s0, -8
; RV64-NEXT: .cfi_offset s1, -16
; RV64-NEXT: .cfi_offset s2, -24
@@ -1049,6 +1101,10 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: .cfi_offset s5, -48
; RV64-NEXT: .cfi_offset s6, -56
; RV64-NEXT: .cfi_offset s7, -64
+; RV64-NEXT: .cfi_offset s8, -72
+; RV64-NEXT: .cfi_offset s9, -80
+; RV64-NEXT: .cfi_offset s10, -88
+; RV64-NEXT: .cfi_offset s11, -96
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v10
; RV64-NEXT: vslidedown.vi v9, v10, 1
@@ -1106,42 +1162,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: vmv.x.s s5, v14
; RV64-NEXT: vmv.x.s s6, v15
; RV64-NEXT: vmv.x.s s7, v16
+; RV64-NEXT: vmv.x.s s8, v17
+; RV64-NEXT: vmv.x.s s9, v18
+; RV64-NEXT: vmv.x.s s10, v19
+; RV64-NEXT: vmv.x.s s11, v20
; RV64-NEXT: vmseq.vx v9, v8, a0
-; RV64-NEXT: vmv.x.s a0, v17
+; RV64-NEXT: vmv.x.s a0, v21
; RV64-NEXT: vmseq.vx v12, v8, s2
-; RV64-NEXT: vmv.x.s s2, v18
+; RV64-NEXT: vmv.x.s s2, v22
; RV64-NEXT: vmseq.vx v13, v8, s3
-; RV64-NEXT: vmv.x.s s3, v19
-; RV64-NEXT: vmseq.vx v14, v8, s4
-; RV64-NEXT: vmv.x.s s4, v20
-; RV64-NEXT: vmseq.vx v15, v8, s5
-; RV64-NEXT: vmv.x.s s5, v21
-; RV64-NEXT: vmseq.vx v16, v8, s6
-; RV64-NEXT: vmv.x.s s6, v22
-; RV64-NEXT: vmseq.vx v17, v8, s7
-; RV64-NEXT: vmv.x.s s7, v11
-; RV64-NEXT: vmseq.vx v11, v8, a0
-; RV64-NEXT: vmv.x.s a0, v23
-; RV64-NEXT: vmseq.vx v18, v8, s2
-; RV64-NEXT: vmv.x.s s2, v10
+; RV64-NEXT: vmv.x.s s3, v11
+; RV64-NEXT: vmseq.vx v11, v8, s4
+; RV64-NEXT: vmv.x.s s4, v23
+; RV64-NEXT: vmseq.vx v14, v8, s5
+; RV64-NEXT: vmv.x.s s5, v10
; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v10, v8, s6
; RV64-NEXT: vmor.mm v9, v9, v13
+; RV64-NEXT: vmseq.vx v12, v8, s7
+; RV64-NEXT: vmor.mm v9, v9, v11
+; RV64-NEXT: vmseq.vx v11, v8, s8
; RV64-NEXT: vmor.mm v9, v9, v14
-; RV64-NEXT: vmor.mm v9, v9, v15
-; RV64-NEXT: vmor.mm v9, v9, v16
-; RV64-NEXT: vmseq.vx v10, v8, s3
-; RV64-NEXT: vmor.mm v9, v9, v17
-; RV64-NEXT: vmseq.vx v12, v8, s4
+; RV64-NEXT: vmseq.vx v13, v8, s9
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vx v10, v8, s10
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v12, v8, s11
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s5
-; RV64-NEXT: vmor.mm v9, v9, v18
-; RV64-NEXT: vmseq.vx v13, v8, s6
+; RV64-NEXT: vmseq.vx v11, v8, a0
+; RV64-NEXT: vmor.mm v9, v9, v13
+; RV64-NEXT: vmseq.vx v13, v8, s2
; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, s7
+; RV64-NEXT: vmseq.vx v10, v8, s3
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, a0
+; RV64-NEXT: vmseq.vx v12, v8, s4
; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s2
+; RV64-NEXT: vmseq.vx v11, v8, s5
; RV64-NEXT: vmor.mm v9, v9, v13
; RV64-NEXT: vmseq.vx v13, v8, a1
; RV64-NEXT: vmor.mm v9, v9, v10
@@ -1179,14 +1235,18 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: vmseq.vx v8, v8, s1
; RV64-NEXT: vmor.mm v8, v9, v8
; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld s0, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: .cfi_restore s2
@@ -1195,7 +1255,11 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: .cfi_restore s5
; RV64-NEXT: .cfi_restore s6
; RV64-NEXT: .cfi_restore s7
-; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: .cfi_restore s8
+; RV64-NEXT: .cfi_restore s9
+; RV64-NEXT: .cfi_restore s10
+; RV64-NEXT: .cfi_restore s11
+; RV64-NEXT: addi sp, sp, 96
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 22e6f23d4d6e6a..123048d996360c 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2203,136 +2203,139 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s1, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: lbu t1, 4(a0)
+; RV32I-NEXT: lbu t3, 5(a0)
+; RV32I-NEXT: lbu t4, 6(a0)
+; RV32I-NEXT: lbu s0, 7(a0)
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu s3, 9(a0)
+; RV32I-NEXT: lbu s6, 10(a0)
+; RV32I-NEXT: lbu s8, 11(a0)
+; RV32I-NEXT: lbu s9, 12(a0)
+; RV32I-NEXT: lbu s10, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s7, 15(a0)
+; RV32I-NEXT: lbu s5, 16(a0)
+; RV32I-NEXT: lbu s11, 17(a0)
+; RV32I-NEXT: lbu ra, 18(a0)
+; RV32I-NEXT: lbu a3, 19(a0)
+; RV32I-NEXT: lbu t5, 20(a0)
+; RV32I-NEXT: lbu t6, 21(a0)
+; RV32I-NEXT: lbu a7, 22(a0)
+; RV32I-NEXT: lbu t0, 23(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli s0, s0, 24
+; RV32I-NEXT: or a4, a4, s1
+; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: slli t4, t4, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s1, s0
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: or a5, t3, t1
+; RV32I-NEXT: or a6, s0, t4
+; RV32I-NEXT: lbu t1, 24(a0)
; RV32I-NEXT: lbu s0, 25(a0)
; RV32I-NEXT: lbu s1, 26(a0)
; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s3, s3, 8
; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: or t2, s3, t2
+; RV32I-NEXT: or t3, s8, s6
+; RV32I-NEXT: or t4, s10, s9
; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s4, 29(a0)
-; RV32I-NEXT: lbu s5, 30(a0)
-; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or a0, s11, s10
-; RV32I-NEXT: or t6, s0, t6
-; RV32I-NEXT: or s0, s2, s1
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 1(a1)
-; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s8, 30(a0)
+; RV32I-NEXT: lbu s9, 31(a0)
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s7, s4
+; RV32I-NEXT: or s4, s11, s5
+; RV32I-NEXT: or s5, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s10, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 64(sp)
+; RV32I-NEXT: sw zero, 68(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: or s3, s4, s3
-; RV32I-NEXT: mv s4, sp
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: addi t6, sp, 8
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s5, s6, s5
-; RV32I-NEXT: or s1, s2, s1
-; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or a0, a0, t5
-; RV32I-NEXT: or t0, s0, t6
-; RV32I-NEXT: or t1, s5, s3
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, s0, t1
+; RV32I-NEXT: or t1, s2, s1
+; RV32I-NEXT: or s0, s6, s3
+; RV32I-NEXT: or s1, s9, s8
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s10
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s2
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t3, t2
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or a7, a7, t5
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw a7, 28(sp)
+; RV32I-NEXT: sw t0, 32(sp)
+; RV32I-NEXT: sw s0, 36(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
; RV32I-NEXT: sw a0, 20(sp)
-; RV32I-NEXT: sw t0, 24(sp)
-; RV32I-NEXT: sw t1, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: slli t1, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: add a1, s4, a1
+; RV32I-NEXT: add a1, t6, a1
; RV32I-NEXT: andi a0, t1, 24
-; RV32I-NEXT: xori a7, a0, 31
+; RV32I-NEXT: xori t0, a0, 31
; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a6, 12(a1)
-; RV32I-NEXT: lw t0, 16(a1)
+; RV32I-NEXT: lw a7, 16(a1)
; RV32I-NEXT: lw t2, 20(a1)
; RV32I-NEXT: lw t3, 24(a1)
; RV32I-NEXT: lw t4, 28(a1)
@@ -2341,33 +2344,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a1, a3, t1
; RV32I-NEXT: slli t6, a4, 1
; RV32I-NEXT: srl a3, a6, t1
-; RV32I-NEXT: slli s0, t0, 1
+; RV32I-NEXT: slli s0, a7, 1
; RV32I-NEXT: srl a4, a5, t1
; RV32I-NEXT: slli s1, a6, 1
; RV32I-NEXT: srl a5, t2, t1
; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: srl a6, t0, t1
+; RV32I-NEXT: srl a6, a7, t1
; RV32I-NEXT: slli t2, t2, 1
-; RV32I-NEXT: srl t0, t3, t1
+; RV32I-NEXT: srl a7, t3, t1
; RV32I-NEXT: slli t3, t4, 1
; RV32I-NEXT: srl t1, t4, t1
-; RV32I-NEXT: sll t4, t5, a7
-; RV32I-NEXT: sll t5, t6, a7
-; RV32I-NEXT: sll t6, s0, a7
-; RV32I-NEXT: sll s0, s1, a7
-; RV32I-NEXT: sll s1, s2, a7
-; RV32I-NEXT: sll t2, t2, a7
-; RV32I-NEXT: sll t3, t3, a7
+; RV32I-NEXT: sll t4, t5, t0
+; RV32I-NEXT: sll t5, t6, t0
+; RV32I-NEXT: sll t6, s0, t0
+; RV32I-NEXT: sll s0, s1, t0
+; RV32I-NEXT: sll s1, s2, t0
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: sll t3, t3, t0
; RV32I-NEXT: srli s2, t1, 24
; RV32I-NEXT: srli s3, t1, 16
; RV32I-NEXT: srli s4, t1, 8
-; RV32I-NEXT: or a7, a0, t4
+; RV32I-NEXT: or t0, a0, t4
; RV32I-NEXT: or t4, a1, t5
; RV32I-NEXT: or t5, a3, t6
; RV32I-NEXT: or s0, a4, s0
; RV32I-NEXT: or s1, a5, s1
; RV32I-NEXT: or t2, a6, t2
-; RV32I-NEXT: or t3, t0, t3
+; RV32I-NEXT: or t3, a7, t3
; RV32I-NEXT: sb t1, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
; RV32I-NEXT: sb s3, 30(a2)
@@ -2384,23 +2387,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s6, s0, 24
; RV32I-NEXT: srli s7, s0, 16
; RV32I-NEXT: srli s0, s0, 8
-; RV32I-NEXT: sb t0, 24(a2)
-; RV32I-NEXT: srli t0, t5, 24
-; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, t5, 16
+; RV32I-NEXT: srli s8, t5, 24
+; RV32I-NEXT: srli s9, t5, 16
; RV32I-NEXT: srli t5, t5, 8
+; RV32I-NEXT: srli s10, t4, 24
+; RV32I-NEXT: srli s11, t4, 16
+; RV32I-NEXT: srli t4, t4, 8
+; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: sb t3, 25(a2)
; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: srli t6, t4, 24
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, t4, 16
-; RV32I-NEXT: srli t4, t4, 8
+; RV32I-NEXT: srli a7, t0, 24
; RV32I-NEXT: sb a6, 16(a2)
-; RV32I-NEXT: srli a6, a7, 24
; RV32I-NEXT: sb t2, 17(a2)
; RV32I-NEXT: sb s3, 18(a2)
; RV32I-NEXT: sb s2, 19(a2)
-; RV32I-NEXT: srli t2, a7, 16
-; RV32I-NEXT: srli a7, a7, 8
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: srli t0, t0, 8
; RV32I-NEXT: sb a5, 20(a2)
; RV32I-NEXT: sb s1, 21(a2)
; RV32I-NEXT: sb s5, 22(a2)
@@ -2411,29 +2414,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: sb a3, 12(a2)
; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t3, 14(a2)
-; RV32I-NEXT: sb t0, 15(a2)
+; RV32I-NEXT: sb s9, 14(a2)
+; RV32I-NEXT: sb s8, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t6, 3(a2)
+; RV32I-NEXT: sb s11, 2(a2)
+; RV32I-NEXT: sb s10, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: sb a7, 5(a2)
-; RV32I-NEXT: sb t2, 6(a2)
-; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2678,128 +2682,132 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
;
; RV32I-LABEL: lshr_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: lbu a5, 0(a0)
+; RV32I-NEXT: lbu a7, 1(a0)
+; RV32I-NEXT: lbu t0, 2(a0)
+; RV32I-NEXT: lbu t1, 3(a0)
+; RV32I-NEXT: lbu s2, 4(a0)
+; RV32I-NEXT: lbu s4, 5(a0)
+; RV32I-NEXT: lbu s5, 6(a0)
+; RV32I-NEXT: lbu s6, 7(a0)
+; RV32I-NEXT: lbu s3, 8(a0)
+; RV32I-NEXT: lbu s9, 9(a0)
+; RV32I-NEXT: lbu s10, 10(a0)
+; RV32I-NEXT: lbu s11, 11(a0)
+; RV32I-NEXT: lbu ra, 12(a0)
+; RV32I-NEXT: lbu a1, 13(a0)
+; RV32I-NEXT: lbu t4, 14(a0)
+; RV32I-NEXT: lbu t6, 15(a0)
+; RV32I-NEXT: lbu a4, 16(a0)
+; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a6, 17(a0)
+; RV32I-NEXT: lbu t2, 18(a0)
+; RV32I-NEXT: lbu t3, 19(a0)
+; RV32I-NEXT: lbu a4, 20(a0)
+; RV32I-NEXT: lbu t5, 21(a0)
; RV32I-NEXT: lbu s0, 22(a0)
; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli t4, t4, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s3, s2
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s2, 25(a0)
-; RV32I-NEXT: lbu s3, 26(a0)
-; RV32I-NEXT: lbu s4, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or t6, s11, s10
-; RV32I-NEXT: lbu s5, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s7, 30(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s2
+; RV32I-NEXT: or t1, s6, s5
+; RV32I-NEXT: lbu s2, 24(a0)
+; RV32I-NEXT: lbu s6, 25(a0)
+; RV32I-NEXT: lbu s7, 26(a0)
+; RV32I-NEXT: lbu s8, 27(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s3, s9, s3
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: or s5, a1, ra
+; RV32I-NEXT: lbu s9, 28(a0)
+; RV32I-NEXT: lbu a1, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: lbu a3, 0(a3)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 64(sp)
+; RV32I-NEXT: sw zero, 68(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: or t4, t6, t4
+; RV32I-NEXT: addi t6, sp, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: mv s1, sp
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: or t3, s2, t3
-; RV32I-NEXT: or s2, s4, s3
-; RV32I-NEXT: or s3, s6, s5
-; RV32I-NEXT: or a0, a0, s7
-; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t5, t4
-; RV32I-NEXT: or t0, s0, t6
-; RV32I-NEXT: or t1, s2, t3
-; RV32I-NEXT: or a0, a0, s3
-; RV32I-NEXT: add s1, s1, a1
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t0, 20(sp)
-; RV32I-NEXT: sw t1, 24(sp)
-; RV32I-NEXT: sw a0, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: slli a3, a3, 2
+; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a6, s11
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: or a4, t5, a4
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or t3, s6, s2
+; RV32I-NEXT: or t5, s8, s7
+; RV32I-NEXT: or a1, a1, s9
+; RV32I-NEXT: or a0, a0, s10
+; RV32I-NEXT: andi a3, a3, 28
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s3
+; RV32I-NEXT: or t1, t4, s5
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: or a4, s0, a4
+; RV32I-NEXT: or t2, t5, t3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: add t6, t6, a3
+; RV32I-NEXT: sw a6, 24(sp)
+; RV32I-NEXT: sw a4, 28(sp)
+; RV32I-NEXT: sw t2, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: lw a6, 16(s1)
-; RV32I-NEXT: lw a5, 20(s1)
-; RV32I-NEXT: lw a7, 24(s1)
-; RV32I-NEXT: lw a1, 0(s1)
-; RV32I-NEXT: lw a0, 4(s1)
-; RV32I-NEXT: lw a4, 8(s1)
-; RV32I-NEXT: lw a3, 12(s1)
-; RV32I-NEXT: lw t0, 28(s1)
+; RV32I-NEXT: sw a7, 12(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: lw a6, 16(t6)
+; RV32I-NEXT: lw a5, 20(t6)
+; RV32I-NEXT: lw a7, 24(t6)
+; RV32I-NEXT: lw a1, 0(t6)
+; RV32I-NEXT: lw a0, 4(t6)
+; RV32I-NEXT: lw a4, 8(t6)
+; RV32I-NEXT: lw a3, 12(t6)
+; RV32I-NEXT: lw t0, 28(t6)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -2814,21 +2822,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
+; RV32I-NEXT: srli s8, a4, 8
+; RV32I-NEXT: srli s9, a3, 24
+; RV32I-NEXT: srli s10, a3, 16
+; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a4, 8
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
-; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a3, 8
+; RV32I-NEXT: srli t1, a1, 16
; RV32I-NEXT: sb t0, 28(a2)
-; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
-; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t4, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -2840,35 +2848,36 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb a7, 9(a2)
+; RV32I-NEXT: sb s8, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb t1, 13(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t3, 15(a2)
+; RV32I-NEXT: sb s11, 13(a2)
+; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb s9, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t6, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -2894,111 +2903,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: lbu a4, 1(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: lbu a7, 4(a0)
-; RV64I-NEXT: lbu t0, 5(a0)
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli t0, t0, 8
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: slli t4, t4, 8
-; RV64I-NEXT: slli t5, t5, 16
-; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: lbu a5, 0(a0)
+; RV64I-NEXT: lbu a7, 1(a0)
+; RV64I-NEXT: lbu t2, 2(a0)
+; RV64I-NEXT: lbu s3, 3(a0)
+; RV64I-NEXT: lbu t0, 4(a0)
+; RV64I-NEXT: lbu s8, 5(a0)
+; RV64I-NEXT: lbu s9, 6(a0)
+; RV64I-NEXT: lbu s10, 7(a0)
+; RV64I-NEXT: lbu s2, 8(a0)
+; RV64I-NEXT: lbu s4, 9(a0)
+; RV64I-NEXT: lbu s5, 10(a0)
+; RV64I-NEXT: lbu s6, 11(a0)
+; RV64I-NEXT: lbu s7, 12(a0)
+; RV64I-NEXT: lbu s11, 13(a0)
+; RV64I-NEXT: lbu t1, 14(a0)
+; RV64I-NEXT: lbu t3, 15(a0)
+; RV64I-NEXT: lbu a3, 16(a0)
+; RV64I-NEXT: lbu a6, 17(a0)
+; RV64I-NEXT: lbu t4, 18(a0)
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu a4, 20(a0)
+; RV64I-NEXT: lbu t6, 21(a0)
+; RV64I-NEXT: lbu s0, 22(a0)
+; RV64I-NEXT: lbu s1, 23(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: slli t2, t2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
-; RV64I-NEXT: lbu t3, 24(a0)
-; RV64I-NEXT: lbu t4, 25(a0)
-; RV64I-NEXT: lbu t5, 26(a0)
-; RV64I-NEXT: lbu t6, 27(a0)
-; RV64I-NEXT: slli s5, s5, 8
-; RV64I-NEXT: slli s6, s6, 16
-; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s0, s5, s4
-; RV64I-NEXT: or s1, s7, s6
-; RV64I-NEXT: or s2, s9, s8
-; RV64I-NEXT: lbu s3, 28(a0)
-; RV64I-NEXT: lbu s4, 29(a0)
-; RV64I-NEXT: lbu s5, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: slli s9, s9, 16
+; RV64I-NEXT: slli s10, s10, 24
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a7, s3, t2
+; RV64I-NEXT: or t0, s8, t0
+; RV64I-NEXT: or t2, s10, s9
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s8, 25(a0)
+; RV64I-NEXT: lbu s9, 26(a0)
+; RV64I-NEXT: lbu s10, 27(a0)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: or s2, s4, s2
+; RV64I-NEXT: or s4, s6, s5
+; RV64I-NEXT: or s5, s11, s7
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: lbu s7, 29(a0)
+; RV64I-NEXT: lbu s11, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd zero, 40(sp)
; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: slli s10, s10, 16
-; RV64I-NEXT: slli s11, s11, 24
-; RV64I-NEXT: or s6, s11, s10
-; RV64I-NEXT: mv s7, sp
-; RV64I-NEXT: slli t4, t4, 8
-; RV64I-NEXT: slli t5, t5, 16
-; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t3, t3, 24
+; RV64I-NEXT: or t1, t3, t1
+; RV64I-NEXT: mv t3, sp
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: slli t5, t5, 24
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: slli s1, s1, 24
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: slli s9, s9, 16
+; RV64I-NEXT: slli s10, s10, 24
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: slli s11, s11, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or t3, t4, t3
-; RV64I-NEXT: or t4, t6, t5
-; RV64I-NEXT: or t5, s4, s3
-; RV64I-NEXT: or a0, a0, s5
-; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a6, t5, t4
+; RV64I-NEXT: or a4, t6, a4
; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or a7, s6, s2
-; RV64I-NEXT: or t0, t4, t3
-; RV64I-NEXT: or a0, a0, t5
-; RV64I-NEXT: add s7, s7, a1
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or t4, s8, s3
+; RV64I-NEXT: or t5, s10, s9
+; RV64I-NEXT: or t6, s7, s6
+; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a7, t2, t0
+; RV64I-NEXT: or t0, s4, s2
+; RV64I-NEXT: or t1, t1, s5
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a4, s0, a4
+; RV64I-NEXT: or a6, t5, t4
+; RV64I-NEXT: or a0, a0, t6
+; RV64I-NEXT: add t3, t3, a1
; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a1, a7, a5
+; RV64I-NEXT: or a5, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a1, a6, a5
-; RV64I-NEXT: or a4, a7, s0
-; RV64I-NEXT: or a0, a0, t0
-; RV64I-NEXT: sd a3, 0(sp)
-; RV64I-NEXT: sd a1, 8(sp)
-; RV64I-NEXT: sd a4, 16(sp)
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: sd a1, 0(sp)
+; RV64I-NEXT: sd a5, 8(sp)
+; RV64I-NEXT: sd a3, 16(sp)
; RV64I-NEXT: sd a0, 24(sp)
-; RV64I-NEXT: ld a4, 16(s7)
-; RV64I-NEXT: ld a0, 8(s7)
-; RV64I-NEXT: ld a1, 0(s7)
-; RV64I-NEXT: ld a3, 24(s7)
+; RV64I-NEXT: ld a4, 16(t3)
+; RV64I-NEXT: ld a0, 8(t3)
+; RV64I-NEXT: ld a1, 0(t3)
+; RV64I-NEXT: ld a3, 24(t3)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -3017,25 +3026,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
+; RV64I-NEXT: srli s8, a1, 24
+; RV64I-NEXT: srli s9, a1, 16
+; RV64I-NEXT: srli s10, a1, 8
+; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
-; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
-; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 56
+; RV64I-NEXT: srli a5, a0, 48
; RV64I-NEXT: sb a4, 16(a2)
-; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli t1, a0, 40
+; RV64I-NEXT: srli a4, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli t2, a0, 32
+; RV64I-NEXT: srli a6, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -3045,19 +3054,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli t3, a0, 16
+; RV64I-NEXT: srli a7, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb a6, 1(a2)
-; RV64I-NEXT: sb a7, 2(a2)
-; RV64I-NEXT: sb t0, 3(a2)
+; RV64I-NEXT: sb s10, 1(a2)
+; RV64I-NEXT: sb s9, 2(a2)
+; RV64I-NEXT: sb s8, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb t2, 12(a2)
-; RV64I-NEXT: sb t1, 13(a2)
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: sb a5, 15(a2)
+; RV64I-NEXT: sb a6, 12(a2)
+; RV64I-NEXT: sb a4, 13(a2)
+; RV64I-NEXT: sb a5, 14(a2)
+; RV64I-NEXT: sb s11, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb t3, 10(a2)
+; RV64I-NEXT: sb a7, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -3076,128 +3085,132 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
;
; RV32I-LABEL: lshr_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: lbu a5, 0(a0)
+; RV32I-NEXT: lbu a7, 1(a0)
+; RV32I-NEXT: lbu t0, 2(a0)
+; RV32I-NEXT: lbu t1, 3(a0)
+; RV32I-NEXT: lbu s2, 4(a0)
+; RV32I-NEXT: lbu s4, 5(a0)
+; RV32I-NEXT: lbu s5, 6(a0)
+; RV32I-NEXT: lbu s6, 7(a0)
+; RV32I-NEXT: lbu s3, 8(a0)
+; RV32I-NEXT: lbu s9, 9(a0)
+; RV32I-NEXT: lbu s10, 10(a0)
+; RV32I-NEXT: lbu s11, 11(a0)
+; RV32I-NEXT: lbu ra, 12(a0)
+; RV32I-NEXT: lbu a1, 13(a0)
+; RV32I-NEXT: lbu t4, 14(a0)
+; RV32I-NEXT: lbu t6, 15(a0)
+; RV32I-NEXT: lbu a4, 16(a0)
+; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a6, 17(a0)
+; RV32I-NEXT: lbu t2, 18(a0)
+; RV32I-NEXT: lbu t3, 19(a0)
+; RV32I-NEXT: lbu a4, 20(a0)
+; RV32I-NEXT: lbu t5, 21(a0)
; RV32I-NEXT: lbu s0, 22(a0)
; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli t4, t4, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s3, s2
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s2, 25(a0)
-; RV32I-NEXT: lbu s3, 26(a0)
-; RV32I-NEXT: lbu s4, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or t6, s11, s10
-; RV32I-NEXT: lbu s5, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s7, 30(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s2
+; RV32I-NEXT: or t1, s6, s5
+; RV32I-NEXT: lbu s2, 24(a0)
+; RV32I-NEXT: lbu s6, 25(a0)
+; RV32I-NEXT: lbu s7, 26(a0)
+; RV32I-NEXT: lbu s8, 27(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s3, s9, s3
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: or s5, a1, ra
+; RV32I-NEXT: lbu s9, 28(a0)
+; RV32I-NEXT: lbu a1, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: lbu a3, 0(a3)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 64(sp)
+; RV32I-NEXT: sw zero, 68(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: or t4, t6, t4
+; RV32I-NEXT: addi t6, sp, 8
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: mv s1, sp
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a1, a1, 3
-; RV32I-NEXT: or t3, s2, t3
-; RV32I-NEXT: or s2, s4, s3
-; RV32I-NEXT: or s3, s6, s5
-; RV32I-NEXT: or a0, a0, s7
-; RV32I-NEXT: andi a1, a1, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t5, t4
-; RV32I-NEXT: or t0, s0, t6
-; RV32I-NEXT: or t1, s2, t3
-; RV32I-NEXT: or a0, a0, s3
-; RV32I-NEXT: add s1, s1, a1
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t0, 20(sp)
-; RV32I-NEXT: sw t1, 24(sp)
-; RV32I-NEXT: sw a0, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: slli a3, a3, 3
+; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a6, s11
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: or a4, t5, a4
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or t3, s6, s2
+; RV32I-NEXT: or t5, s8, s7
+; RV32I-NEXT: or a1, a1, s9
+; RV32I-NEXT: or a0, a0, s10
+; RV32I-NEXT: andi a3, a3, 24
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s3
+; RV32I-NEXT: or t1, t4, s5
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: or a4, s0, a4
+; RV32I-NEXT: or t2, t5, t3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: add t6, t6, a3
+; RV32I-NEXT: sw a6, 24(sp)
+; RV32I-NEXT: sw a4, 28(sp)
+; RV32I-NEXT: sw t2, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: lw a6, 16(s1)
-; RV32I-NEXT: lw a5, 20(s1)
-; RV32I-NEXT: lw a7, 24(s1)
-; RV32I-NEXT: lw a1, 0(s1)
-; RV32I-NEXT: lw a0, 4(s1)
-; RV32I-NEXT: lw a4, 8(s1)
-; RV32I-NEXT: lw a3, 12(s1)
-; RV32I-NEXT: lw t0, 28(s1)
+; RV32I-NEXT: sw a7, 12(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: lw a6, 16(t6)
+; RV32I-NEXT: lw a5, 20(t6)
+; RV32I-NEXT: lw a7, 24(t6)
+; RV32I-NEXT: lw a1, 0(t6)
+; RV32I-NEXT: lw a0, 4(t6)
+; RV32I-NEXT: lw a4, 8(t6)
+; RV32I-NEXT: lw a3, 12(t6)
+; RV32I-NEXT: lw t0, 28(t6)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -3212,21 +3225,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
+; RV32I-NEXT: srli s8, a4, 8
+; RV32I-NEXT: srli s9, a3, 24
+; RV32I-NEXT: srli s10, a3, 16
+; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a4, 8
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
-; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a3, 8
+; RV32I-NEXT: srli t1, a1, 16
; RV32I-NEXT: sb t0, 28(a2)
-; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
-; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t4, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -3238,35 +3251,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb a7, 9(a2)
+; RV32I-NEXT: sb s8, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb t1, 13(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t3, 15(a2)
+; RV32I-NEXT: sb s11, 13(a2)
+; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb s9, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t6, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -3510,129 +3524,132 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu s1, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: lbu t1, 4(a0)
+; RV32I-NEXT: lbu t3, 5(a0)
+; RV32I-NEXT: lbu t4, 6(a0)
+; RV32I-NEXT: lbu s0, 7(a0)
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu s3, 9(a0)
+; RV32I-NEXT: lbu s6, 10(a0)
+; RV32I-NEXT: lbu s8, 11(a0)
+; RV32I-NEXT: lbu s9, 12(a0)
+; RV32I-NEXT: lbu s10, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s7, 15(a0)
+; RV32I-NEXT: lbu s5, 16(a0)
+; RV32I-NEXT: lbu s11, 17(a0)
+; RV32I-NEXT: lbu ra, 18(a0)
+; RV32I-NEXT: lbu a3, 19(a0)
+; RV32I-NEXT: lbu t5, 20(a0)
+; RV32I-NEXT: lbu t6, 21(a0)
+; RV32I-NEXT: lbu a7, 22(a0)
+; RV32I-NEXT: lbu t0, 23(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli s0, s0, 24
+; RV32I-NEXT: or a4, a4, s1
+; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: slli t4, t4, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s1, s0
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: or a5, t3, t1
+; RV32I-NEXT: or a6, s0, t4
+; RV32I-NEXT: lbu t1, 24(a0)
; RV32I-NEXT: lbu s0, 25(a0)
; RV32I-NEXT: lbu s1, 26(a0)
; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s3, s3, 8
; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: or t2, s3, t2
+; RV32I-NEXT: or t3, s8, s6
+; RV32I-NEXT: or t4, s10, s9
; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s4, 29(a0)
-; RV32I-NEXT: lbu s5, 30(a0)
-; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or a0, s11, s10
-; RV32I-NEXT: or t6, s0, t6
-; RV32I-NEXT: or s0, s2, s1
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 1(a1)
-; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s8, 30(a0)
+; RV32I-NEXT: lbu s9, 31(a0)
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s7, s4
+; RV32I-NEXT: or s4, s11, s5
+; RV32I-NEXT: or s5, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s10, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 0(sp)
-; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: or s3, s4, s3
-; RV32I-NEXT: addi s4, sp, 32
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: slli t6, t6, 8
+; RV32I-NEXT: or t5, t6, t5
+; RV32I-NEXT: addi t6, sp, 40
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s5, s6, s5
-; RV32I-NEXT: or s1, s2, s1
-; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or a0, a0, t5
-; RV32I-NEXT: or t0, s0, t6
-; RV32I-NEXT: or t1, s5, s3
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, s0, t1
+; RV32I-NEXT: or t1, s2, s1
+; RV32I-NEXT: or s0, s6, s3
+; RV32I-NEXT: or s1, s9, s8
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s10
+; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s2
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t3, t2
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or a7, a7, t5
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: sw t2, 56(sp)
+; RV32I-NEXT: sw a7, 60(sp)
+; RV32I-NEXT: sw t0, 64(sp)
+; RV32I-NEXT: sw s0, 68(sp)
+; RV32I-NEXT: sw a4, 40(sp)
+; RV32I-NEXT: sw a5, 44(sp)
+; RV32I-NEXT: sw a6, 48(sp)
; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: sw t0, 56(sp)
-; RV32I-NEXT: sw t1, 60(sp)
-; RV32I-NEXT: sw a3, 32(sp)
-; RV32I-NEXT: sw a4, 36(sp)
-; RV32I-NEXT: sw a5, 40(sp)
-; RV32I-NEXT: sw a6, 44(sp)
; RV32I-NEXT: slli a3, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: sub a1, s4, a1
+; RV32I-NEXT: sub a1, t6, a1
; RV32I-NEXT: andi a0, a3, 24
; RV32I-NEXT: xori a0, a0, 31
; RV32I-NEXT: lw a4, 0(a1)
@@ -3647,10 +3664,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli t4, a4, 1
; RV32I-NEXT: sll t5, a7, a3
; RV32I-NEXT: srli t6, a6, 1
-; RV32I-NEXT: sll a6, a6, a3
+; RV32I-NEXT: sll s0, a6, a3
; RV32I-NEXT: srli a5, a5, 1
-; RV32I-NEXT: sll s0, t1, a3
-; RV32I-NEXT: srli s1, t0, 1
+; RV32I-NEXT: sll s1, t1, a3
+; RV32I-NEXT: srli a6, t0, 1
; RV32I-NEXT: sll s2, t0, a3
; RV32I-NEXT: srli a7, a7, 1
; RV32I-NEXT: sll s3, a1, a3
@@ -3658,56 +3675,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sll s4, t2, a3
; RV32I-NEXT: srli t0, t1, 1
; RV32I-NEXT: sll s5, a4, a3
-; RV32I-NEXT: srl t4, t4, a0
-; RV32I-NEXT: srl a4, t6, a0
-; RV32I-NEXT: srl t1, a5, a0
-; RV32I-NEXT: srl t6, s1, a0
-; RV32I-NEXT: srl s1, a7, a0
-; RV32I-NEXT: srl s6, a1, a0
-; RV32I-NEXT: srl s7, t0, a0
-; RV32I-NEXT: srli t2, s4, 24
-; RV32I-NEXT: srli t0, s3, 24
+; RV32I-NEXT: srl t2, t4, a0
+; RV32I-NEXT: srl t4, t6, a0
+; RV32I-NEXT: srl t6, a5, a0
+; RV32I-NEXT: srl s6, a6, a0
+; RV32I-NEXT: srl s7, a7, a0
+; RV32I-NEXT: srl s8, a1, a0
+; RV32I-NEXT: srl s9, t0, a0
+; RV32I-NEXT: srli t1, s4, 24
+; RV32I-NEXT: srli a7, s3, 24
; RV32I-NEXT: srli a5, s2, 24
-; RV32I-NEXT: srli a3, s0, 24
-; RV32I-NEXT: srli a1, a6, 24
+; RV32I-NEXT: srli a3, s1, 24
+; RV32I-NEXT: srli a1, s0, 24
; RV32I-NEXT: srli a0, t5, 24
-; RV32I-NEXT: srli s8, s5, 24
-; RV32I-NEXT: or a4, t5, a4
-; RV32I-NEXT: srli t5, s5, 16
-; RV32I-NEXT: or t1, a6, t1
-; RV32I-NEXT: srli s9, s5, 8
-; RV32I-NEXT: or a7, t3, t4
-; RV32I-NEXT: srli a6, t3, 24
-; RV32I-NEXT: or t3, s0, t6
-; RV32I-NEXT: or t4, s2, s1
-; RV32I-NEXT: or t6, s3, s6
-; RV32I-NEXT: or s0, s4, s7
+; RV32I-NEXT: srli s10, s5, 24
+; RV32I-NEXT: srli s11, s5, 16
+; RV32I-NEXT: srli ra, s5, 8
+; RV32I-NEXT: srli a4, t3, 24
+; RV32I-NEXT: or a6, t3, t2
+; RV32I-NEXT: or t0, t5, t4
+; RV32I-NEXT: or t2, s0, t6
+; RV32I-NEXT: or t3, s1, s6
+; RV32I-NEXT: or t4, s2, s7
+; RV32I-NEXT: or t5, s3, s8
+; RV32I-NEXT: or t6, s4, s9
; RV32I-NEXT: sb s5, 0(a2)
-; RV32I-NEXT: sb s9, 1(a2)
-; RV32I-NEXT: sb t5, 2(a2)
-; RV32I-NEXT: sb s8, 3(a2)
-; RV32I-NEXT: srli t5, s0, 16
-; RV32I-NEXT: srli s1, s0, 8
-; RV32I-NEXT: srli s2, t6, 16
-; RV32I-NEXT: srli s3, t6, 8
+; RV32I-NEXT: sb ra, 1(a2)
+; RV32I-NEXT: sb s11, 2(a2)
+; RV32I-NEXT: sb s10, 3(a2)
+; RV32I-NEXT: srli s0, t6, 16
+; RV32I-NEXT: srli s1, t6, 8
+; RV32I-NEXT: srli s2, t5, 16
+; RV32I-NEXT: srli s3, t5, 8
; RV32I-NEXT: srli s4, t4, 16
; RV32I-NEXT: srli s5, t4, 8
; RV32I-NEXT: srli s6, t3, 16
; RV32I-NEXT: srli s7, t3, 8
-; RV32I-NEXT: sb s0, 24(a2)
-; RV32I-NEXT: srli s0, t1, 16
+; RV32I-NEXT: srli s8, t2, 16
+; RV32I-NEXT: srli s9, t2, 8
+; RV32I-NEXT: srli s10, t0, 16
+; RV32I-NEXT: srli s11, t0, 8
+; RV32I-NEXT: sb t6, 24(a2)
; RV32I-NEXT: sb s1, 25(a2)
-; RV32I-NEXT: srli s1, t1, 8
-; RV32I-NEXT: sb t5, 26(a2)
-; RV32I-NEXT: srli t5, a4, 16
-; RV32I-NEXT: sb t2, 27(a2)
-; RV32I-NEXT: srli t2, a4, 8
-; RV32I-NEXT: sb t6, 28(a2)
-; RV32I-NEXT: srli t6, a7, 16
+; RV32I-NEXT: sb s0, 26(a2)
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t5, 28(a2)
; RV32I-NEXT: sb s3, 29(a2)
; RV32I-NEXT: sb s2, 30(a2)
-; RV32I-NEXT: sb t0, 31(a2)
-; RV32I-NEXT: srli t0, a7, 8
+; RV32I-NEXT: sb a7, 31(a2)
+; RV32I-NEXT: srli a7, a6, 8
; RV32I-NEXT: sb t4, 16(a2)
; RV32I-NEXT: sb s5, 17(a2)
; RV32I-NEXT: sb s4, 18(a2)
@@ -3716,31 +3733,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s7, 21(a2)
; RV32I-NEXT: sb s6, 22(a2)
; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: sb t1, 8(a2)
-; RV32I-NEXT: sb s1, 9(a2)
-; RV32I-NEXT: sb s0, 10(a2)
+; RV32I-NEXT: sb t2, 8(a2)
+; RV32I-NEXT: sb s9, 9(a2)
+; RV32I-NEXT: sb s8, 10(a2)
; RV32I-NEXT: sb a1, 11(a2)
-; RV32I-NEXT: sb a4, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t5, 14(a2)
+; RV32I-NEXT: sb t0, 12(a2)
+; RV32I-NEXT: sb s11, 13(a2)
+; RV32I-NEXT: sb s10, 14(a2)
; RV32I-NEXT: sb a0, 15(a2)
-; RV32I-NEXT: sb a7, 4(a2)
-; RV32I-NEXT: sb t0, 5(a2)
-; RV32I-NEXT: sb t6, 6(a2)
-; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: sb a6, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb t1, 6(a2)
+; RV32I-NEXT: sb a4, 7(a2)
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -3985,128 +4003,132 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
;
; RV32I-LABEL: shl_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: lbu a5, 0(a0)
+; RV32I-NEXT: lbu a7, 1(a0)
+; RV32I-NEXT: lbu t0, 2(a0)
+; RV32I-NEXT: lbu t1, 3(a0)
+; RV32I-NEXT: lbu s2, 4(a0)
+; RV32I-NEXT: lbu s4, 5(a0)
+; RV32I-NEXT: lbu s5, 6(a0)
+; RV32I-NEXT: lbu s6, 7(a0)
+; RV32I-NEXT: lbu s3, 8(a0)
+; RV32I-NEXT: lbu s9, 9(a0)
+; RV32I-NEXT: lbu s10, 10(a0)
+; RV32I-NEXT: lbu s11, 11(a0)
+; RV32I-NEXT: lbu ra, 12(a0)
+; RV32I-NEXT: lbu a1, 13(a0)
+; RV32I-NEXT: lbu t4, 14(a0)
+; RV32I-NEXT: lbu t6, 15(a0)
+; RV32I-NEXT: lbu a4, 16(a0)
+; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a6, 17(a0)
+; RV32I-NEXT: lbu t2, 18(a0)
+; RV32I-NEXT: lbu t3, 19(a0)
+; RV32I-NEXT: lbu a4, 20(a0)
+; RV32I-NEXT: lbu t5, 21(a0)
; RV32I-NEXT: lbu s0, 22(a0)
; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli t4, t4, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s3, s2
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s2, 25(a0)
-; RV32I-NEXT: lbu s3, 26(a0)
-; RV32I-NEXT: lbu s4, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or t6, s11, s10
-; RV32I-NEXT: lbu s5, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s7, 30(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s2
+; RV32I-NEXT: or t1, s6, s5
+; RV32I-NEXT: lbu s2, 24(a0)
+; RV32I-NEXT: lbu s6, 25(a0)
+; RV32I-NEXT: lbu s7, 26(a0)
+; RV32I-NEXT: lbu s8, 27(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s3, s9, s3
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: or s5, a1, ra
+; RV32I-NEXT: lbu s9, 28(a0)
+; RV32I-NEXT: lbu a1, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: lbu a3, 0(a3)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 0(sp)
-; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: or t4, t6, t4
+; RV32I-NEXT: addi t6, sp, 40
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: addi s1, sp, 32
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: or t3, s2, t3
-; RV32I-NEXT: or s2, s4, s3
-; RV32I-NEXT: or s3, s6, s5
-; RV32I-NEXT: or a0, a0, s7
-; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t5, t4
-; RV32I-NEXT: or t0, s0, t6
-; RV32I-NEXT: or t1, s2, t3
-; RV32I-NEXT: or a0, a0, s3
-; RV32I-NEXT: sub s1, s1, a1
-; RV32I-NEXT: sw a7, 48(sp)
-; RV32I-NEXT: sw t0, 52(sp)
-; RV32I-NEXT: sw t1, 56(sp)
-; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a3, 32(sp)
-; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: slli a3, a3, 2
+; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a6, s11
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: or a4, t5, a4
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or t3, s6, s2
+; RV32I-NEXT: or t5, s8, s7
+; RV32I-NEXT: or a1, a1, s9
+; RV32I-NEXT: or a0, a0, s10
+; RV32I-NEXT: andi a3, a3, 28
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s3
+; RV32I-NEXT: or t1, t4, s5
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: or a4, s0, a4
+; RV32I-NEXT: or t2, t5, t3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: sub t3, t6, a3
+; RV32I-NEXT: sw a6, 56(sp)
+; RV32I-NEXT: sw a4, 60(sp)
+; RV32I-NEXT: sw t2, 64(sp)
+; RV32I-NEXT: sw a0, 68(sp)
; RV32I-NEXT: sw a5, 40(sp)
-; RV32I-NEXT: sw a6, 44(sp)
-; RV32I-NEXT: lw a6, 16(s1)
-; RV32I-NEXT: lw a5, 20(s1)
-; RV32I-NEXT: lw a7, 24(s1)
-; RV32I-NEXT: lw a1, 0(s1)
-; RV32I-NEXT: lw a0, 4(s1)
-; RV32I-NEXT: lw a4, 8(s1)
-; RV32I-NEXT: lw a3, 12(s1)
-; RV32I-NEXT: lw t0, 28(s1)
+; RV32I-NEXT: sw a7, 44(sp)
+; RV32I-NEXT: sw t0, 48(sp)
+; RV32I-NEXT: sw t1, 52(sp)
+; RV32I-NEXT: lw a6, 16(t3)
+; RV32I-NEXT: lw a5, 20(t3)
+; RV32I-NEXT: lw a7, 24(t3)
+; RV32I-NEXT: lw a1, 0(t3)
+; RV32I-NEXT: lw a0, 4(t3)
+; RV32I-NEXT: lw a4, 8(t3)
+; RV32I-NEXT: lw a3, 12(t3)
+; RV32I-NEXT: lw t0, 28(t3)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -4121,21 +4143,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
+; RV32I-NEXT: srli s8, a4, 8
+; RV32I-NEXT: srli s9, a3, 24
+; RV32I-NEXT: srli s10, a3, 16
+; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a4, 8
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
-; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a3, 8
+; RV32I-NEXT: srli t1, a1, 16
; RV32I-NEXT: sb t0, 28(a2)
-; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
-; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t4, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -4147,35 +4169,36 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb a7, 9(a2)
+; RV32I-NEXT: sb s8, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb t1, 13(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t3, 15(a2)
+; RV32I-NEXT: sb s11, 13(a2)
+; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb s9, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t6, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -4201,111 +4224,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: lbu a4, 1(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: lbu a7, 4(a0)
-; RV64I-NEXT: lbu t0, 5(a0)
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli t0, t0, 8
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: slli t4, t4, 8
-; RV64I-NEXT: slli t5, t5, 16
-; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: slli s2, s2, 16
+; RV64I-NEXT: lbu a5, 0(a0)
+; RV64I-NEXT: lbu a7, 1(a0)
+; RV64I-NEXT: lbu t2, 2(a0)
+; RV64I-NEXT: lbu s3, 3(a0)
+; RV64I-NEXT: lbu t0, 4(a0)
+; RV64I-NEXT: lbu s8, 5(a0)
+; RV64I-NEXT: lbu s9, 6(a0)
+; RV64I-NEXT: lbu s10, 7(a0)
+; RV64I-NEXT: lbu s2, 8(a0)
+; RV64I-NEXT: lbu s4, 9(a0)
+; RV64I-NEXT: lbu s5, 10(a0)
+; RV64I-NEXT: lbu s6, 11(a0)
+; RV64I-NEXT: lbu s7, 12(a0)
+; RV64I-NEXT: lbu s11, 13(a0)
+; RV64I-NEXT: lbu t1, 14(a0)
+; RV64I-NEXT: lbu t3, 15(a0)
+; RV64I-NEXT: lbu a3, 16(a0)
+; RV64I-NEXT: lbu a6, 17(a0)
+; RV64I-NEXT: lbu t4, 18(a0)
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu a4, 20(a0)
+; RV64I-NEXT: lbu t6, 21(a0)
+; RV64I-NEXT: lbu s0, 22(a0)
+; RV64I-NEXT: lbu s1, 23(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: slli t2, t2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
-; RV64I-NEXT: lbu t3, 24(a0)
-; RV64I-NEXT: lbu t4, 25(a0)
-; RV64I-NEXT: lbu t5, 26(a0)
-; RV64I-NEXT: lbu t6, 27(a0)
-; RV64I-NEXT: slli s5, s5, 8
-; RV64I-NEXT: slli s6, s6, 16
-; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s0, s5, s4
-; RV64I-NEXT: or s1, s7, s6
-; RV64I-NEXT: or s2, s9, s8
-; RV64I-NEXT: lbu s3, 28(a0)
-; RV64I-NEXT: lbu s4, 29(a0)
-; RV64I-NEXT: lbu s5, 30(a0)
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: slli s9, s9, 16
+; RV64I-NEXT: slli s10, s10, 24
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a7, s3, t2
+; RV64I-NEXT: or t0, s8, t0
+; RV64I-NEXT: or t2, s10, s9
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s8, 25(a0)
+; RV64I-NEXT: lbu s9, 26(a0)
+; RV64I-NEXT: lbu s10, 27(a0)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: or s2, s4, s2
+; RV64I-NEXT: or s4, s6, s5
+; RV64I-NEXT: or s5, s11, s7
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: lbu s7, 29(a0)
+; RV64I-NEXT: lbu s11, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd zero, 8(sp)
; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: slli s10, s10, 16
-; RV64I-NEXT: slli s11, s11, 24
-; RV64I-NEXT: or s6, s11, s10
-; RV64I-NEXT: addi s7, sp, 32
-; RV64I-NEXT: slli t4, t4, 8
-; RV64I-NEXT: slli t5, t5, 16
-; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t3, t3, 24
+; RV64I-NEXT: or t1, t3, t1
+; RV64I-NEXT: addi t3, sp, 32
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: slli t5, t5, 24
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: slli s1, s1, 24
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: slli s9, s9, 16
+; RV64I-NEXT: slli s10, s10, 24
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: slli s11, s11, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or t3, t4, t3
-; RV64I-NEXT: or t4, t6, t5
-; RV64I-NEXT: or t5, s4, s3
-; RV64I-NEXT: or a0, a0, s5
-; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a6, t5, t4
+; RV64I-NEXT: or a4, t6, a4
; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or a7, s6, s2
-; RV64I-NEXT: or t0, t4, t3
-; RV64I-NEXT: or a0, a0, t5
-; RV64I-NEXT: sub t1, s7, a1
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or t4, s8, s3
+; RV64I-NEXT: or t5, s10, s9
+; RV64I-NEXT: or t6, s7, s6
+; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a7, t2, t0
+; RV64I-NEXT: or t0, s4, s2
+; RV64I-NEXT: or t1, t1, s5
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a4, s0, a4
+; RV64I-NEXT: or a6, t5, t4
+; RV64I-NEXT: or a0, a0, t6
+; RV64I-NEXT: sub t2, t3, a1
; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a1, a7, a5
+; RV64I-NEXT: or a5, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a1, a6, a5
-; RV64I-NEXT: or a4, a7, s0
-; RV64I-NEXT: or a0, a0, t0
-; RV64I-NEXT: sd a3, 32(sp)
-; RV64I-NEXT: sd a1, 40(sp)
-; RV64I-NEXT: sd a4, 48(sp)
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: sd a1, 32(sp)
+; RV64I-NEXT: sd a5, 40(sp)
+; RV64I-NEXT: sd a3, 48(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: ld a4, 16(t1)
-; RV64I-NEXT: ld a0, 8(t1)
-; RV64I-NEXT: ld a1, 0(t1)
-; RV64I-NEXT: ld a3, 24(t1)
+; RV64I-NEXT: ld a4, 16(t2)
+; RV64I-NEXT: ld a0, 8(t2)
+; RV64I-NEXT: ld a1, 0(t2)
+; RV64I-NEXT: ld a3, 24(t2)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -4324,25 +4347,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
+; RV64I-NEXT: srli s8, a1, 24
+; RV64I-NEXT: srli s9, a1, 16
+; RV64I-NEXT: srli s10, a1, 8
+; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
-; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
-; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 56
+; RV64I-NEXT: srli a5, a0, 48
; RV64I-NEXT: sb a4, 16(a2)
-; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli t1, a0, 40
+; RV64I-NEXT: srli a4, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli t2, a0, 32
+; RV64I-NEXT: srli a6, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -4352,19 +4375,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli t3, a0, 16
+; RV64I-NEXT: srli a7, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb a6, 1(a2)
-; RV64I-NEXT: sb a7, 2(a2)
-; RV64I-NEXT: sb t0, 3(a2)
+; RV64I-NEXT: sb s10, 1(a2)
+; RV64I-NEXT: sb s9, 2(a2)
+; RV64I-NEXT: sb s8, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb t2, 12(a2)
-; RV64I-NEXT: sb t1, 13(a2)
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: sb a5, 15(a2)
+; RV64I-NEXT: sb a6, 12(a2)
+; RV64I-NEXT: sb a4, 13(a2)
+; RV64I-NEXT: sb a5, 14(a2)
+; RV64I-NEXT: sb s11, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb t3, 10(a2)
+; RV64I-NEXT: sb a7, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -4383,128 +4406,132 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
;
; RV32I-LABEL: shl_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: lbu a5, 0(a0)
+; RV32I-NEXT: lbu a7, 1(a0)
+; RV32I-NEXT: lbu t0, 2(a0)
+; RV32I-NEXT: lbu t1, 3(a0)
+; RV32I-NEXT: lbu s2, 4(a0)
+; RV32I-NEXT: lbu s4, 5(a0)
+; RV32I-NEXT: lbu s5, 6(a0)
+; RV32I-NEXT: lbu s6, 7(a0)
+; RV32I-NEXT: lbu s3, 8(a0)
+; RV32I-NEXT: lbu s9, 9(a0)
+; RV32I-NEXT: lbu s10, 10(a0)
+; RV32I-NEXT: lbu s11, 11(a0)
+; RV32I-NEXT: lbu ra, 12(a0)
+; RV32I-NEXT: lbu a1, 13(a0)
+; RV32I-NEXT: lbu t4, 14(a0)
+; RV32I-NEXT: lbu t6, 15(a0)
+; RV32I-NEXT: lbu a4, 16(a0)
+; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a6, 17(a0)
+; RV32I-NEXT: lbu t2, 18(a0)
+; RV32I-NEXT: lbu t3, 19(a0)
+; RV32I-NEXT: lbu a4, 20(a0)
+; RV32I-NEXT: lbu t5, 21(a0)
; RV32I-NEXT: lbu s0, 22(a0)
; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli t4, t4, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s3, s2
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s2, 25(a0)
-; RV32I-NEXT: lbu s3, 26(a0)
-; RV32I-NEXT: lbu s4, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or t6, s11, s10
-; RV32I-NEXT: lbu s5, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s7, 30(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s2
+; RV32I-NEXT: or t1, s6, s5
+; RV32I-NEXT: lbu s2, 24(a0)
+; RV32I-NEXT: lbu s6, 25(a0)
+; RV32I-NEXT: lbu s7, 26(a0)
+; RV32I-NEXT: lbu s8, 27(a0)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: or s3, s9, s3
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: or s5, a1, ra
+; RV32I-NEXT: lbu s9, 28(a0)
+; RV32I-NEXT: lbu a1, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: lbu a3, 0(a3)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 0(sp)
-; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: or t4, t6, t4
+; RV32I-NEXT: addi t6, sp, 40
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: addi s1, sp, 32
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a1, a1, 3
-; RV32I-NEXT: or t3, s2, t3
-; RV32I-NEXT: or s2, s4, s3
-; RV32I-NEXT: or s3, s6, s5
-; RV32I-NEXT: or a0, a0, s7
-; RV32I-NEXT: andi a1, a1, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t5, t4
-; RV32I-NEXT: or t0, s0, t6
-; RV32I-NEXT: or t1, s2, t3
-; RV32I-NEXT: or a0, a0, s3
-; RV32I-NEXT: sub s1, s1, a1
-; RV32I-NEXT: sw a7, 48(sp)
-; RV32I-NEXT: sw t0, 52(sp)
-; RV32I-NEXT: sw t1, 56(sp)
-; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a3, 32(sp)
-; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: slli a3, a3, 3
+; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a6, s11
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: or a4, t5, a4
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: or t3, s6, s2
+; RV32I-NEXT: or t5, s8, s7
+; RV32I-NEXT: or a1, a1, s9
+; RV32I-NEXT: or a0, a0, s10
+; RV32I-NEXT: andi a3, a3, 24
+; RV32I-NEXT: or a5, a7, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or t0, s4, s3
+; RV32I-NEXT: or t1, t4, s5
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: or a4, s0, a4
+; RV32I-NEXT: or t2, t5, t3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: sub t3, t6, a3
+; RV32I-NEXT: sw a6, 56(sp)
+; RV32I-NEXT: sw a4, 60(sp)
+; RV32I-NEXT: sw t2, 64(sp)
+; RV32I-NEXT: sw a0, 68(sp)
; RV32I-NEXT: sw a5, 40(sp)
-; RV32I-NEXT: sw a6, 44(sp)
-; RV32I-NEXT: lw a6, 16(s1)
-; RV32I-NEXT: lw a5, 20(s1)
-; RV32I-NEXT: lw a7, 24(s1)
-; RV32I-NEXT: lw a1, 0(s1)
-; RV32I-NEXT: lw a0, 4(s1)
-; RV32I-NEXT: lw a4, 8(s1)
-; RV32I-NEXT: lw a3, 12(s1)
-; RV32I-NEXT: lw t0, 28(s1)
+; RV32I-NEXT: sw a7, 44(sp)
+; RV32I-NEXT: sw t0, 48(sp)
+; RV32I-NEXT: sw t1, 52(sp)
+; RV32I-NEXT: lw a6, 16(t3)
+; RV32I-NEXT: lw a5, 20(t3)
+; RV32I-NEXT: lw a7, 24(t3)
+; RV32I-NEXT: lw a1, 0(t3)
+; RV32I-NEXT: lw a0, 4(t3)
+; RV32I-NEXT: lw a4, 8(t3)
+; RV32I-NEXT: lw a3, 12(t3)
+; RV32I-NEXT: lw t0, 28(t3)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -4519,21 +4546,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
+; RV32I-NEXT: srli s8, a4, 8
+; RV32I-NEXT: srli s9, a3, 24
+; RV32I-NEXT: srli s10, a3, 16
+; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a4, 8
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
-; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a3, 8
+; RV32I-NEXT: srli t1, a1, 16
; RV32I-NEXT: sb t0, 28(a2)
-; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
-; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t4, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -4545,35 +4572,36 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb a7, 9(a2)
+; RV32I-NEXT: sb s8, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb t1, 13(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t3, 15(a2)
+; RV32I-NEXT: sb s11, 13(a2)
+; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb s9, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t6, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -4818,137 +4846,140 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu t6, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: lbu t1, 4(a0)
+; RV32I-NEXT: lbu t3, 5(a0)
+; RV32I-NEXT: lbu t4, 6(a0)
+; RV32I-NEXT: lbu t5, 7(a0)
+; RV32I-NEXT: lbu t2, 8(a0)
+; RV32I-NEXT: lbu s1, 9(a0)
+; RV32I-NEXT: lbu s7, 10(a0)
+; RV32I-NEXT: lbu s8, 11(a0)
+; RV32I-NEXT: lbu s9, 12(a0)
+; RV32I-NEXT: lbu s10, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s6, 15(a0)
+; RV32I-NEXT: lbu s5, 16(a0)
+; RV32I-NEXT: lbu s11, 17(a0)
+; RV32I-NEXT: lbu ra, 18(a0)
+; RV32I-NEXT: lbu a3, 19(a0)
+; RV32I-NEXT: lbu s2, 20(a0)
+; RV32I-NEXT: lbu s3, 21(a0)
+; RV32I-NEXT: lbu a7, 22(a0)
+; RV32I-NEXT: lbu t0, 23(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: or a4, a4, t6
+; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: slli t4, t4, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: or a5, t3, t1
+; RV32I-NEXT: or a6, t5, t4
+; RV32I-NEXT: lbu t1, 24(a0)
+; RV32I-NEXT: lbu t5, 25(a0)
+; RV32I-NEXT: lbu t6, 26(a0)
+; RV32I-NEXT: lbu s0, 27(a0)
; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s1, s0
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t6, 24(a0)
-; RV32I-NEXT: lbu s0, 25(a0)
-; RV32I-NEXT: lbu s1, 26(a0)
-; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s4, 29(a0)
-; RV32I-NEXT: lbu s5, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or s6, s11, s10
-; RV32I-NEXT: or t6, s0, t6
-; RV32I-NEXT: or s0, s2, s1
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 1(a1)
-; RV32I-NEXT: lbu s7, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: or s3, s4, s3
-; RV32I-NEXT: mv s4, sp
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: slli s10, s10, 8
+; RV32I-NEXT: or t2, s1, t2
+; RV32I-NEXT: or t3, s8, s7
+; RV32I-NEXT: or t4, s10, s9
+; RV32I-NEXT: lbu s1, 28(a0)
+; RV32I-NEXT: lbu s7, 29(a0)
+; RV32I-NEXT: lbu s8, 30(a0)
+; RV32I-NEXT: lbu s9, 31(a0)
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s6, s4
+; RV32I-NEXT: or s4, s11, s5
+; RV32I-NEXT: or s5, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s6, 1(a1)
+; RV32I-NEXT: lbu s10, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s2, s3, s2
+; RV32I-NEXT: addi s3, sp, 8
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: slli s0, s0, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s5, a0, s5
-; RV32I-NEXT: or s1, s2, s1
-; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: srai a0, a0, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, s6, t5
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, t5, t1
; RV32I-NEXT: or t1, s0, t6
-; RV32I-NEXT: or t2, s5, s3
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: sw a0, 48(sp)
-; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: sw a0, 56(sp)
-; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 32(sp)
-; RV32I-NEXT: sw a0, 36(sp)
-; RV32I-NEXT: sw a0, 40(sp)
-; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t0, 20(sp)
-; RV32I-NEXT: sw t1, 24(sp)
-; RV32I-NEXT: sw t2, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: or t5, s7, s1
+; RV32I-NEXT: or t6, s9, s8
+; RV32I-NEXT: or a3, s6, a3
+; RV32I-NEXT: or a1, a1, s10
+; RV32I-NEXT: srai s0, s9, 31
+; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s1
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t3, t2
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or a7, a7, s2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: or t1, t6, t5
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: sw s0, 56(sp)
+; RV32I-NEXT: sw s0, 60(sp)
+; RV32I-NEXT: sw s0, 64(sp)
+; RV32I-NEXT: sw s0, 68(sp)
+; RV32I-NEXT: sw s0, 40(sp)
+; RV32I-NEXT: sw s0, 44(sp)
+; RV32I-NEXT: sw s0, 48(sp)
+; RV32I-NEXT: sw s0, 52(sp)
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw a7, 28(sp)
+; RV32I-NEXT: sw t0, 32(sp)
+; RV32I-NEXT: sw t1, 36(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a0, 20(sp)
; RV32I-NEXT: slli t1, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: add a1, s4, a1
+; RV32I-NEXT: add a1, s3, a1
; RV32I-NEXT: andi a0, t1, 24
-; RV32I-NEXT: xori a7, a0, 31
+; RV32I-NEXT: xori t0, a0, 31
; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a6, 12(a1)
-; RV32I-NEXT: lw t0, 16(a1)
+; RV32I-NEXT: lw a7, 16(a1)
; RV32I-NEXT: lw t2, 20(a1)
; RV32I-NEXT: lw t3, 24(a1)
; RV32I-NEXT: lw t4, 28(a1)
@@ -4957,33 +4988,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a1, a3, t1
; RV32I-NEXT: slli t6, a4, 1
; RV32I-NEXT: srl a3, a6, t1
-; RV32I-NEXT: slli s0, t0, 1
+; RV32I-NEXT: slli s0, a7, 1
; RV32I-NEXT: srl a4, a5, t1
; RV32I-NEXT: slli s1, a6, 1
; RV32I-NEXT: srl a5, t2, t1
; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: srl a6, t0, t1
+; RV32I-NEXT: srl a6, a7, t1
; RV32I-NEXT: slli t2, t2, 1
-; RV32I-NEXT: srl t0, t3, t1
+; RV32I-NEXT: srl a7, t3, t1
; RV32I-NEXT: slli t3, t4, 1
; RV32I-NEXT: sra t1, t4, t1
-; RV32I-NEXT: sll t4, t5, a7
-; RV32I-NEXT: sll t5, t6, a7
-; RV32I-NEXT: sll t6, s0, a7
-; RV32I-NEXT: sll s0, s1, a7
-; RV32I-NEXT: sll s1, s2, a7
-; RV32I-NEXT: sll t2, t2, a7
-; RV32I-NEXT: sll t3, t3, a7
+; RV32I-NEXT: sll t4, t5, t0
+; RV32I-NEXT: sll t5, t6, t0
+; RV32I-NEXT: sll t6, s0, t0
+; RV32I-NEXT: sll s0, s1, t0
+; RV32I-NEXT: sll s1, s2, t0
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: sll t3, t3, t0
; RV32I-NEXT: srli s2, t1, 24
; RV32I-NEXT: srli s3, t1, 16
; RV32I-NEXT: srli s4, t1, 8
-; RV32I-NEXT: or a7, a0, t4
+; RV32I-NEXT: or t0, a0, t4
; RV32I-NEXT: or t4, a1, t5
; RV32I-NEXT: or t5, a3, t6
; RV32I-NEXT: or s0, a4, s0
; RV32I-NEXT: or s1, a5, s1
; RV32I-NEXT: or t2, a6, t2
-; RV32I-NEXT: or t3, t0, t3
+; RV32I-NEXT: or t3, a7, t3
; RV32I-NEXT: sb t1, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
; RV32I-NEXT: sb s3, 30(a2)
@@ -5000,23 +5031,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s6, s0, 24
; RV32I-NEXT: srli s7, s0, 16
; RV32I-NEXT: srli s0, s0, 8
-; RV32I-NEXT: sb t0, 24(a2)
-; RV32I-NEXT: srli t0, t5, 24
-; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, t5, 16
+; RV32I-NEXT: srli s8, t5, 24
+; RV32I-NEXT: srli s9, t5, 16
; RV32I-NEXT: srli t5, t5, 8
+; RV32I-NEXT: srli s10, t4, 24
+; RV32I-NEXT: srli s11, t4, 16
+; RV32I-NEXT: srli t4, t4, 8
+; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: sb t3, 25(a2)
; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: srli t6, t4, 24
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, t4, 16
-; RV32I-NEXT: srli t4, t4, 8
+; RV32I-NEXT: srli a7, t0, 24
; RV32I-NEXT: sb a6, 16(a2)
-; RV32I-NEXT: srli a6, a7, 24
; RV32I-NEXT: sb t2, 17(a2)
; RV32I-NEXT: sb s3, 18(a2)
; RV32I-NEXT: sb s2, 19(a2)
-; RV32I-NEXT: srli t2, a7, 16
-; RV32I-NEXT: srli a7, a7, 8
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: srli t0, t0, 8
; RV32I-NEXT: sb a5, 20(a2)
; RV32I-NEXT: sb s1, 21(a2)
; RV32I-NEXT: sb s5, 22(a2)
@@ -5027,29 +5058,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: sb a3, 12(a2)
; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t3, 14(a2)
-; RV32I-NEXT: sb t0, 15(a2)
+; RV32I-NEXT: sb s9, 14(a2)
+; RV32I-NEXT: sb s8, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t6, 3(a2)
+; RV32I-NEXT: sb s11, 2(a2)
+; RV32I-NEXT: sb s10, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: sb a7, 5(a2)
-; RV32I-NEXT: sb t2, 6(a2)
-; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb a6, 6(a2)
+; RV32I-NEXT: sb a7, 7(a2)
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -5295,129 +5327,130 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
;
; RV32I-LABEL: ashr_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a5, 0(a0)
+; RV32I-NEXT: lbu a6, 1(a0)
+; RV32I-NEXT: lbu a7, 2(a0)
+; RV32I-NEXT: lbu t1, 3(a0)
+; RV32I-NEXT: lbu s0, 4(a0)
+; RV32I-NEXT: lbu s2, 5(a0)
+; RV32I-NEXT: lbu s3, 6(a0)
+; RV32I-NEXT: lbu s6, 7(a0)
+; RV32I-NEXT: lbu s1, 8(a0)
+; RV32I-NEXT: lbu s7, 9(a0)
+; RV32I-NEXT: lbu s8, 10(a0)
+; RV32I-NEXT: lbu s9, 11(a0)
+; RV32I-NEXT: lbu s10, 12(a0)
+; RV32I-NEXT: lbu s11, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu t0, 17(a0)
+; RV32I-NEXT: lbu t2, 18(a0)
+; RV32I-NEXT: lbu t3, 19(a0)
+; RV32I-NEXT: lbu a4, 20(a0)
+; RV32I-NEXT: lbu t4, 21(a0)
+; RV32I-NEXT: lbu t5, 22(a0)
+; RV32I-NEXT: lbu t6, 23(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t1, a7
+; RV32I-NEXT: or a7, s2, s0
+; RV32I-NEXT: or t1, s6, s3
+; RV32I-NEXT: lbu s0, 24(a0)
+; RV32I-NEXT: lbu s6, 25(a0)
+; RV32I-NEXT: lbu ra, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or s1, s7, s1
+; RV32I-NEXT: or s7, s9, s8
+; RV32I-NEXT: or s3, s11, s10
+; RV32I-NEXT: lbu s8, 28(a0)
+; RV32I-NEXT: lbu s9, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or s4, s5, s4
+; RV32I-NEXT: addi s5, sp, 8
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s1, s0
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu t5, 25(a0)
-; RV32I-NEXT: lbu t6, 26(a0)
-; RV32I-NEXT: lbu s0, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli s2, s2, 24
; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t4, s5, s4
-; RV32I-NEXT: or s1, s7, s6
-; RV32I-NEXT: or s2, s9, s8
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s4, 29(a0)
-; RV32I-NEXT: lbu s5, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: or s6, s11, s10
-; RV32I-NEXT: mv s7, sp
-; RV32I-NEXT: slli t5, t5, 8
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: or t3, t5, t3
-; RV32I-NEXT: or t5, s0, t6
-; RV32I-NEXT: or t6, s4, s3
-; RV32I-NEXT: or s0, a0, s5
+; RV32I-NEXT: or a3, t0, a3
+; RV32I-NEXT: or t0, t3, t2
+; RV32I-NEXT: or a4, t4, a4
+; RV32I-NEXT: or t2, t6, t5
+; RV32I-NEXT: or t3, s6, s0
+; RV32I-NEXT: or t4, s2, ra
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, a0, s10
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, s1, t4
-; RV32I-NEXT: or t0, s6, s2
-; RV32I-NEXT: or t1, t5, t3
-; RV32I-NEXT: or t2, s0, t6
-; RV32I-NEXT: sw a0, 48(sp)
-; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t1, a7
+; RV32I-NEXT: or a7, s7, s1
+; RV32I-NEXT: or t1, s4, s3
+; RV32I-NEXT: or a3, t0, a3
+; RV32I-NEXT: or a4, t2, a4
+; RV32I-NEXT: or t0, t4, t3
+; RV32I-NEXT: or t2, t6, t5
; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 32(sp)
-; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 64(sp)
+; RV32I-NEXT: sw a0, 68(sp)
; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: add s7, s7, a1
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t0, 20(sp)
-; RV32I-NEXT: sw t1, 24(sp)
-; RV32I-NEXT: sw t2, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: add s5, s5, a1
+; RV32I-NEXT: sw a3, 24(sp)
+; RV32I-NEXT: sw a4, 28(sp)
+; RV32I-NEXT: sw t0, 32(sp)
+; RV32I-NEXT: sw t2, 36(sp)
; RV32I-NEXT: sw a5, 8(sp)
; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: lw a6, 16(s7)
-; RV32I-NEXT: lw a5, 20(s7)
-; RV32I-NEXT: lw a7, 24(s7)
-; RV32I-NEXT: lw a1, 0(s7)
-; RV32I-NEXT: lw a0, 4(s7)
-; RV32I-NEXT: lw a4, 8(s7)
-; RV32I-NEXT: lw a3, 12(s7)
-; RV32I-NEXT: lw t0, 28(s7)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: lw a6, 16(s5)
+; RV32I-NEXT: lw a5, 20(s5)
+; RV32I-NEXT: lw a7, 24(s5)
+; RV32I-NEXT: lw a1, 0(s5)
+; RV32I-NEXT: lw a0, 4(s5)
+; RV32I-NEXT: lw a4, 8(s5)
+; RV32I-NEXT: lw a3, 12(s5)
+; RV32I-NEXT: lw t0, 28(s5)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -5432,21 +5465,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
+; RV32I-NEXT: srli s8, a4, 8
+; RV32I-NEXT: srli s9, a3, 24
+; RV32I-NEXT: srli s10, a3, 16
+; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a4, 8
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
-; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a3, 8
+; RV32I-NEXT: srli t1, a1, 16
; RV32I-NEXT: sb t0, 28(a2)
-; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
-; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t4, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -5458,35 +5491,36 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb a7, 9(a2)
+; RV32I-NEXT: sb s8, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb t1, 13(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t3, 15(a2)
+; RV32I-NEXT: sb s11, 13(a2)
+; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb s9, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t6, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -5512,112 +5546,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: lbu a4, 1(a0)
-; RV64I-NEXT: lbu a5, 2(a0)
-; RV64I-NEXT: lbu a6, 3(a0)
-; RV64I-NEXT: lbu a7, 4(a0)
-; RV64I-NEXT: lbu t0, 5(a0)
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: lbu a5, 0(a0)
+; RV64I-NEXT: lbu a7, 1(a0)
+; RV64I-NEXT: lbu t1, 2(a0)
+; RV64I-NEXT: lbu s3, 3(a0)
+; RV64I-NEXT: lbu t0, 4(a0)
+; RV64I-NEXT: lbu s8, 5(a0)
+; RV64I-NEXT: lbu s9, 6(a0)
+; RV64I-NEXT: lbu s10, 7(a0)
+; RV64I-NEXT: lbu s2, 8(a0)
+; RV64I-NEXT: lbu s4, 9(a0)
+; RV64I-NEXT: lbu s5, 10(a0)
+; RV64I-NEXT: lbu s6, 11(a0)
+; RV64I-NEXT: lbu s7, 12(a0)
+; RV64I-NEXT: lbu s11, 13(a0)
+; RV64I-NEXT: lbu t4, 14(a0)
+; RV64I-NEXT: lbu t5, 15(a0)
+; RV64I-NEXT: lbu a3, 16(a0)
+; RV64I-NEXT: lbu a6, 17(a0)
+; RV64I-NEXT: lbu t2, 18(a0)
+; RV64I-NEXT: lbu t3, 19(a0)
+; RV64I-NEXT: lbu a4, 20(a0)
+; RV64I-NEXT: lbu t6, 21(a0)
+; RV64I-NEXT: lbu s0, 22(a0)
+; RV64I-NEXT: lbu s1, 23(a0)
+; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: slli t4, t4, 8
-; RV64I-NEXT: slli t5, t5, 16
-; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
-; RV64I-NEXT: lbu t3, 24(a0)
-; RV64I-NEXT: lbu t4, 25(a0)
-; RV64I-NEXT: lbu t5, 26(a0)
-; RV64I-NEXT: lbu t6, 27(a0)
-; RV64I-NEXT: slli s5, s5, 8
-; RV64I-NEXT: slli s6, s6, 16
-; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s0, s5, s4
-; RV64I-NEXT: or s1, s7, s6
-; RV64I-NEXT: or s2, s9, s8
-; RV64I-NEXT: lbu s3, 28(a0)
-; RV64I-NEXT: lbu s4, 29(a0)
-; RV64I-NEXT: lbu s5, 30(a0)
-; RV64I-NEXT: lbu a0, 31(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli s10, s10, 16
-; RV64I-NEXT: slli s11, s11, 24
-; RV64I-NEXT: or s6, s11, s10
-; RV64I-NEXT: mv s7, sp
-; RV64I-NEXT: slli t4, t4, 8
-; RV64I-NEXT: slli t5, t5, 16
-; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: slli s9, s9, 16
+; RV64I-NEXT: slli s10, s10, 24
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a7, s3, t1
+; RV64I-NEXT: or t0, s8, t0
+; RV64I-NEXT: or t1, s10, s9
+; RV64I-NEXT: lbu s3, 24(a0)
+; RV64I-NEXT: lbu s8, 25(a0)
+; RV64I-NEXT: lbu s9, 26(a0)
+; RV64I-NEXT: lbu s10, 27(a0)
; RV64I-NEXT: slli s4, s4, 8
; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s11, s11, 8
+; RV64I-NEXT: or s2, s4, s2
+; RV64I-NEXT: or s4, s6, s5
+; RV64I-NEXT: or s5, s11, s7
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: lbu s7, 29(a0)
+; RV64I-NEXT: lbu s11, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: lbu a1, 0(a1)
+; RV64I-NEXT: slli t4, t4, 16
+; RV64I-NEXT: slli t5, t5, 24
+; RV64I-NEXT: or t4, t5, t4
+; RV64I-NEXT: mv t5, sp
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli t3, t3, 24
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 16
+; RV64I-NEXT: slli s1, s1, 24
+; RV64I-NEXT: slli s8, s8, 8
+; RV64I-NEXT: slli s9, s9, 16
+; RV64I-NEXT: slli s10, s10, 24
+; RV64I-NEXT: slli s7, s7, 8
+; RV64I-NEXT: slli s11, s11, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or t3, t4, t3
-; RV64I-NEXT: or t4, t6, t5
-; RV64I-NEXT: or t5, s4, s3
-; RV64I-NEXT: or a0, a0, s5
-; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a6, t3, t2
+; RV64I-NEXT: or a4, t6, a4
; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or a7, s6, s2
-; RV64I-NEXT: or t0, t4, t3
-; RV64I-NEXT: or a0, a0, t5
-; RV64I-NEXT: add s7, s7, a1
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or t2, s8, s3
+; RV64I-NEXT: or t3, s10, s9
+; RV64I-NEXT: or t6, s7, s6
+; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or t0, s4, s2
+; RV64I-NEXT: or t1, t4, s5
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: or a4, s0, a4
+; RV64I-NEXT: or a6, t3, t2
+; RV64I-NEXT: or a0, a0, t6
+; RV64I-NEXT: add t5, t5, a1
; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, a7, s0
-; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: sd a0, 32(sp)
; RV64I-NEXT: sd a0, 40(sp)
; RV64I-NEXT: sd a0, 48(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: sd a3, 0(sp)
-; RV64I-NEXT: sd a4, 8(sp)
-; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a5, 0(sp)
+; RV64I-NEXT: sd a7, 8(sp)
+; RV64I-NEXT: sd a3, 16(sp)
; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a4, 16(s7)
-; RV64I-NEXT: ld a0, 8(s7)
-; RV64I-NEXT: ld a1, 0(s7)
-; RV64I-NEXT: ld a3, 24(s7)
+; RV64I-NEXT: ld a4, 16(t5)
+; RV64I-NEXT: ld a0, 8(t5)
+; RV64I-NEXT: ld a1, 0(t5)
+; RV64I-NEXT: ld a3, 24(t5)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -5636,25 +5670,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
+; RV64I-NEXT: srli s8, a1, 24
+; RV64I-NEXT: srli s9, a1, 16
+; RV64I-NEXT: srli s10, a1, 8
+; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
-; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
-; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 56
+; RV64I-NEXT: srli a5, a0, 48
; RV64I-NEXT: sb a4, 16(a2)
-; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli t1, a0, 40
+; RV64I-NEXT: srli a4, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli t2, a0, 32
+; RV64I-NEXT: srli a6, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -5664,19 +5698,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli t3, a0, 16
+; RV64I-NEXT: srli a7, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb a6, 1(a2)
-; RV64I-NEXT: sb a7, 2(a2)
-; RV64I-NEXT: sb t0, 3(a2)
+; RV64I-NEXT: sb s10, 1(a2)
+; RV64I-NEXT: sb s9, 2(a2)
+; RV64I-NEXT: sb s8, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb t2, 12(a2)
-; RV64I-NEXT: sb t1, 13(a2)
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: sb a5, 15(a2)
+; RV64I-NEXT: sb a6, 12(a2)
+; RV64I-NEXT: sb a4, 13(a2)
+; RV64I-NEXT: sb a5, 14(a2)
+; RV64I-NEXT: sb s11, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb t3, 10(a2)
+; RV64I-NEXT: sb a7, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -5695,129 +5729,130 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
;
; RV32I-LABEL: ashr_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a5, 0(a0)
+; RV32I-NEXT: lbu a6, 1(a0)
+; RV32I-NEXT: lbu a7, 2(a0)
+; RV32I-NEXT: lbu t1, 3(a0)
+; RV32I-NEXT: lbu s0, 4(a0)
+; RV32I-NEXT: lbu s2, 5(a0)
+; RV32I-NEXT: lbu s3, 6(a0)
+; RV32I-NEXT: lbu s6, 7(a0)
+; RV32I-NEXT: lbu s1, 8(a0)
+; RV32I-NEXT: lbu s7, 9(a0)
+; RV32I-NEXT: lbu s8, 10(a0)
+; RV32I-NEXT: lbu s9, 11(a0)
+; RV32I-NEXT: lbu s10, 12(a0)
+; RV32I-NEXT: lbu s11, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu a3, 16(a0)
+; RV32I-NEXT: lbu t0, 17(a0)
+; RV32I-NEXT: lbu t2, 18(a0)
+; RV32I-NEXT: lbu t3, 19(a0)
+; RV32I-NEXT: lbu a4, 20(a0)
+; RV32I-NEXT: lbu t4, 21(a0)
+; RV32I-NEXT: lbu t5, 22(a0)
+; RV32I-NEXT: lbu t6, 23(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t1, a7
+; RV32I-NEXT: or a7, s2, s0
+; RV32I-NEXT: or t1, s6, s3
+; RV32I-NEXT: lbu s0, 24(a0)
+; RV32I-NEXT: lbu s6, 25(a0)
+; RV32I-NEXT: lbu ra, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or s1, s7, s1
+; RV32I-NEXT: or s7, s9, s8
+; RV32I-NEXT: or s3, s11, s10
+; RV32I-NEXT: lbu s8, 28(a0)
+; RV32I-NEXT: lbu s9, 29(a0)
+; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or s4, s5, s4
+; RV32I-NEXT: addi s5, sp, 8
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s1, s0
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu t5, 25(a0)
-; RV32I-NEXT: lbu t6, 26(a0)
-; RV32I-NEXT: lbu s0, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli s2, s2, 24
; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t4, s5, s4
-; RV32I-NEXT: or s1, s7, s6
-; RV32I-NEXT: or s2, s9, s8
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s4, 29(a0)
-; RV32I-NEXT: lbu s5, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: or s6, s11, s10
-; RV32I-NEXT: mv s7, sp
-; RV32I-NEXT: slli t5, t5, 8
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 3
-; RV32I-NEXT: or t3, t5, t3
-; RV32I-NEXT: or t5, s0, t6
-; RV32I-NEXT: or t6, s4, s3
-; RV32I-NEXT: or s0, a0, s5
+; RV32I-NEXT: or a3, t0, a3
+; RV32I-NEXT: or t0, t3, t2
+; RV32I-NEXT: or a4, t4, a4
+; RV32I-NEXT: or t2, t6, t5
+; RV32I-NEXT: or t3, s6, s0
+; RV32I-NEXT: or t4, s2, ra
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, a0, s10
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: andi a1, a1, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, s1, t4
-; RV32I-NEXT: or t0, s6, s2
-; RV32I-NEXT: or t1, t5, t3
-; RV32I-NEXT: or t2, s0, t6
-; RV32I-NEXT: sw a0, 48(sp)
-; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t1, a7
+; RV32I-NEXT: or a7, s7, s1
+; RV32I-NEXT: or t1, s4, s3
+; RV32I-NEXT: or a3, t0, a3
+; RV32I-NEXT: or a4, t2, a4
+; RV32I-NEXT: or t0, t4, t3
+; RV32I-NEXT: or t2, t6, t5
; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 32(sp)
-; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 64(sp)
+; RV32I-NEXT: sw a0, 68(sp)
; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: add s7, s7, a1
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t0, 20(sp)
-; RV32I-NEXT: sw t1, 24(sp)
-; RV32I-NEXT: sw t2, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: add s5, s5, a1
+; RV32I-NEXT: sw a3, 24(sp)
+; RV32I-NEXT: sw a4, 28(sp)
+; RV32I-NEXT: sw t0, 32(sp)
+; RV32I-NEXT: sw t2, 36(sp)
; RV32I-NEXT: sw a5, 8(sp)
; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: lw a6, 16(s7)
-; RV32I-NEXT: lw a5, 20(s7)
-; RV32I-NEXT: lw a7, 24(s7)
-; RV32I-NEXT: lw a1, 0(s7)
-; RV32I-NEXT: lw a0, 4(s7)
-; RV32I-NEXT: lw a4, 8(s7)
-; RV32I-NEXT: lw a3, 12(s7)
-; RV32I-NEXT: lw t0, 28(s7)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: lw a6, 16(s5)
+; RV32I-NEXT: lw a5, 20(s5)
+; RV32I-NEXT: lw a7, 24(s5)
+; RV32I-NEXT: lw a1, 0(s5)
+; RV32I-NEXT: lw a0, 4(s5)
+; RV32I-NEXT: lw a4, 8(s5)
+; RV32I-NEXT: lw a3, 12(s5)
+; RV32I-NEXT: lw t0, 28(s5)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -5832,21 +5867,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
+; RV32I-NEXT: srli s8, a4, 8
+; RV32I-NEXT: srli s9, a3, 24
+; RV32I-NEXT: srli s10, a3, 16
+; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a4, 8
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t3, 25(a2)
-; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
-; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a3, 8
+; RV32I-NEXT: srli t1, a1, 16
; RV32I-NEXT: sb t0, 28(a2)
-; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
-; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t4, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -5858,35 +5893,36 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb a7, 9(a2)
+; RV32I-NEXT: sb s8, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb t1, 13(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t3, 15(a2)
+; RV32I-NEXT: sb s11, 13(a2)
+; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb s9, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb t6, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index b8952d2cb2b29e..b2c130c2d7c10a 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -1530,24 +1530,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -1556,105 +1557,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t5, 10(a0)
; RV32I-NEXT: lbu t6, 11(a0)
; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a4, a7, a6
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
; RV32I-NEXT: or a7, t4, t3
; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s1, s0
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t6, 24(a0)
-; RV32I-NEXT: lbu s0, 25(a0)
-; RV32I-NEXT: lbu s1, 26(a0)
-; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
; RV32I-NEXT: lbu s4, 29(a0)
; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or a0, s11, s10
-; RV32I-NEXT: or t6, s0, t6
-; RV32I-NEXT: or s0, s2, s1
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 1(a1)
-; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 64(sp)
+; RV32I-NEXT: sw zero, 68(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: or s3, s4, s3
-; RV32I-NEXT: mv s4, sp
; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s5, s6, s5
-; RV32I-NEXT: or s1, s2, s1
-; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, a0, t5
-; RV32I-NEXT: or t1, s0, t6
-; RV32I-NEXT: or t2, s5, s3
-; RV32I-NEXT: or a0, a1, s1
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t0, 20(sp)
-; RV32I-NEXT: sw t1, 24(sp)
-; RV32I-NEXT: sw t2, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw t2, 32(sp)
+; RV32I-NEXT: sw t3, 36(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: add a4, s4, a4
+; RV32I-NEXT: add a4, s3, a4
; RV32I-NEXT: lw a3, 0(a4)
; RV32I-NEXT: lw a5, 4(a4)
; RV32I-NEXT: lw a6, 8(a4)
@@ -1714,13 +1717,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
+; RV32I-NEXT: srli s8, a1, 24
+; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
-; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli a7, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb t5, 17(a2)
; RV32I-NEXT: sb t4, 18(a2)
@@ -1741,26 +1744,27 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 14(a2)
; RV32I-NEXT: sb s5, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t2, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb a7, 1(a2)
+; RV32I-NEXT: sb s9, 2(a2)
+; RV32I-NEXT: sb s8, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2002,24 +2006,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -2028,105 +2033,107 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t5, 10(a0)
; RV32I-NEXT: lbu t6, 11(a0)
; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a4, a7, a6
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
; RV32I-NEXT: or a7, t4, t3
; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: or t1, s1, s0
-; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t6, 24(a0)
-; RV32I-NEXT: lbu s0, 25(a0)
-; RV32I-NEXT: lbu s1, 26(a0)
-; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
; RV32I-NEXT: lbu s4, 29(a0)
; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or a0, s11, s10
-; RV32I-NEXT: or t6, s0, t6
-; RV32I-NEXT: or s0, s2, s1
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 1(a1)
-; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 0(sp)
-; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 40
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: or s3, s4, s3
-; RV32I-NEXT: addi s4, sp, 32
; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s5, s6, s5
-; RV32I-NEXT: or s1, s2, s1
-; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, a0, t5
-; RV32I-NEXT: or t1, s0, t6
-; RV32I-NEXT: or t2, s5, s3
-; RV32I-NEXT: or a0, a1, s1
-; RV32I-NEXT: sw a7, 48(sp)
-; RV32I-NEXT: sw t0, 52(sp)
-; RV32I-NEXT: sw t1, 56(sp)
-; RV32I-NEXT: sw t2, 60(sp)
-; RV32I-NEXT: sw a3, 32(sp)
-; RV32I-NEXT: sw a4, 36(sp)
-; RV32I-NEXT: sw a5, 40(sp)
-; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 56(sp)
+; RV32I-NEXT: sw t1, 60(sp)
+; RV32I-NEXT: sw t2, 64(sp)
+; RV32I-NEXT: sw t3, 68(sp)
+; RV32I-NEXT: sw a4, 40(sp)
+; RV32I-NEXT: sw a5, 44(sp)
+; RV32I-NEXT: sw a6, 48(sp)
+; RV32I-NEXT: sw a7, 52(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: sub a3, s4, a4
+; RV32I-NEXT: sub a3, s3, a4
; RV32I-NEXT: lw a4, 0(a3)
; RV32I-NEXT: lw a5, 4(a3)
; RV32I-NEXT: lw a6, 8(a3)
@@ -2186,13 +2193,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
+; RV32I-NEXT: srli s8, a1, 24
+; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
-; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli a7, a1, 8
; RV32I-NEXT: sb a6, 28(a2)
; RV32I-NEXT: sb t5, 29(a2)
; RV32I-NEXT: sb t4, 30(a2)
@@ -2213,26 +2220,27 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 10(a2)
; RV32I-NEXT: sb s5, 11(a2)
; RV32I-NEXT: sb a1, 12(a2)
-; RV32I-NEXT: sb t0, 13(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb a7, 15(a2)
+; RV32I-NEXT: sb a7, 13(a2)
+; RV32I-NEXT: sb s9, 14(a2)
+; RV32I-NEXT: sb s8, 15(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2475,24 +2483,25 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -112
-; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -128
+; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a5, 2(a0)
-; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu a6, 2(a0)
+; RV32I-NEXT: lbu a7, 3(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -2509,98 +2518,100 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s6, 18(a0)
; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: or a4, a7, a6
; RV32I-NEXT: lbu s8, 20(a0)
; RV32I-NEXT: lbu s9, 21(a0)
; RV32I-NEXT: lbu s10, 22(a0)
; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu ra, 24(a0)
+; RV32I-NEXT: lbu a3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: slli s2, s2, 16
; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: slli s5, s5, 8
; RV32I-NEXT: or t1, s1, s0
; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: lbu t6, 24(a0)
-; RV32I-NEXT: lbu s0, 25(a0)
-; RV32I-NEXT: lbu s1, 26(a0)
-; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s0, 29(a0)
+; RV32I-NEXT: lbu s1, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: slli s6, s6, 16
; RV32I-NEXT: slli s7, s7, 24
; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s4, 29(a0)
-; RV32I-NEXT: lbu s5, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or s6, s11, s10
-; RV32I-NEXT: or t6, s0, t6
-; RV32I-NEXT: or s0, s2, s1
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: or s2, s7, s6
+; RV32I-NEXT: or s3, s9, s8
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: lbu s5, 0(a1)
+; RV32I-NEXT: lbu s6, 1(a1)
; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: or s3, s4, s3
-; RV32I-NEXT: mv s4, sp
-; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: addi s8, sp, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or s5, a0, s5
-; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or s1, a0, s1
+; RV32I-NEXT: or t6, s6, s5
; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: srai s2, a0, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, s6, t5
-; RV32I-NEXT: or t1, s0, t6
-; RV32I-NEXT: or t2, s5, s3
-; RV32I-NEXT: or a0, a1, s1
-; RV32I-NEXT: sw s2, 48(sp)
-; RV32I-NEXT: sw s2, 52(sp)
-; RV32I-NEXT: sw s2, 56(sp)
-; RV32I-NEXT: sw s2, 60(sp)
-; RV32I-NEXT: sw s2, 32(sp)
-; RV32I-NEXT: sw s2, 36(sp)
-; RV32I-NEXT: sw s2, 40(sp)
-; RV32I-NEXT: sw s2, 44(sp)
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t0, 20(sp)
-; RV32I-NEXT: sw t1, 24(sp)
-; RV32I-NEXT: sw t2, 28(sp)
-; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a4, 4(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: srai s0, a0, 31
+; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, a0
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, s2, t3
+; RV32I-NEXT: or t1, s4, s3
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: or t2, s1, t5
+; RV32I-NEXT: or a0, a1, t6
+; RV32I-NEXT: sw s0, 56(sp)
+; RV32I-NEXT: sw s0, 60(sp)
+; RV32I-NEXT: sw s0, 64(sp)
+; RV32I-NEXT: sw s0, 68(sp)
+; RV32I-NEXT: sw s0, 40(sp)
+; RV32I-NEXT: sw s0, 44(sp)
+; RV32I-NEXT: sw s0, 48(sp)
+; RV32I-NEXT: sw s0, 52(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw t2, 36(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: add a4, s4, a4
+; RV32I-NEXT: add a4, s8, a4
; RV32I-NEXT: lw a3, 0(a4)
; RV32I-NEXT: lw a5, 4(a4)
; RV32I-NEXT: lw a6, 8(a4)
@@ -2660,13 +2671,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
+; RV32I-NEXT: srli s8, a1, 24
+; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
-; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli a7, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb t5, 17(a2)
; RV32I-NEXT: sb t4, 18(a2)
@@ -2687,26 +2698,27 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 14(a2)
; RV32I-NEXT: sb s5, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t2, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb a7, 1(a2)
+; RV32I-NEXT: sb s9, 2(a2)
+; RV32I-NEXT: sb s8, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 112
+; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 128
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 67759bd5c4632e..2c4b1f36ffd23d 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -50,8 +50,8 @@ public:
const char *getRegPressureSetName(unsigned Idx) const override {
return "bogus";
}
- unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const override {
+ unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
+ bool RemoveReserved) const override {
return 0;
}
const int *
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index a6f87119aca5ba..674925c1b2acd3 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -275,7 +275,8 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
OS << "// Get the register unit pressure limit for this dimension.\n"
<< "// This limit must be adjusted dynamically for reserved registers.\n"
<< "unsigned " << ClassName << "::\n"
- << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const "
+ << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, bool "
+ "RemoveReserved) const "
"{\n"
<< " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
<< " PressureLimitTable[] = {\n";
@@ -1130,7 +1131,7 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) {
<< " unsigned getNumRegPressureSets() const override;\n"
<< " const char *getRegPressureSetName(unsigned Idx) const override;\n"
<< " unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned "
- "Idx) const override;\n"
+ "Idx, bool RemoveReserved = true) const override;\n"
<< " const int *getRegClassPressureSets("
<< "const TargetRegisterClass *RC) const override;\n"
<< " const int *getRegUnitPressureSets("
From 6b0461f0b6b90dcd983cf288220879d6c087e99d Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Tue, 3 Dec 2024 21:47:30 +0800
Subject: [PATCH 3/3] Revert "Test commit: add a parameter to keep reserved"
This reverts commit e96f7f7898790da1fe9cdc5cd3be7e3ae8eb8705.
---
.../include/llvm/CodeGen/TargetRegisterInfo.h | 4 +-
llvm/lib/CodeGen/RegisterClassInfo.cpp | 3 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 +-
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 8 +-
llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 4 +-
llvm/test/CodeGen/RISCV/pr69586.ll | 844 ++---
.../RISCV/rvv/fixed-vectors-masked-scatter.ll | 78 +-
.../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 2104 ++++++-----
.../RISCV/rvv/intrinsic-vector-match.ll | 472 ++-
...lar-shift-by-byte-multiple-legalization.ll | 3242 ++++++++---------
.../RISCV/wide-scalar-shift-legalization.ll | 646 ++--
llvm/unittests/CodeGen/MFCommon.inc | 4 +-
llvm/utils/TableGen/RegisterInfoEmitter.cpp | 5 +-
14 files changed, 3606 insertions(+), 3815 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index eaed26e33c4eb5..292fa3c94969be 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -914,10 +914,8 @@ class TargetRegisterInfo : public MCRegisterInfo {
/// Get the register unit pressure limit for this dimension.
/// This limit must be adjusted dynamically for reserved registers.
- /// If RemoveReserved is true, the target should remove reserved registers.
virtual unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx,
- bool RemoveReserved = true) const = 0;
+ unsigned Idx) const = 0;
/// Get the dimensions of register pressure impacted by this register class.
/// Returns a -1 terminated array of pressure set IDs.
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 0a33915ed1e40b..9312bc03bc522a 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -222,8 +222,7 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
assert(RC && "Failed to find register class");
compute(RC);
unsigned NAllocatableRegs = getNumAllocatableRegs(RC);
- unsigned RegPressureSetLimit =
- TRI->getRegPressureSetLimit(*MF, Idx, /*RemoveReserved=*/false);
+ unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx);
// If all the regs are reserved, return raw RegPressureSetLimit.
// One example is VRSAVERC in PowerPC.
// Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9883454ed78298..049f4af4dd2f93 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3640,8 +3640,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx,
- bool RemoveReserved) const {
+ unsigned Idx) const {
if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
Idx == AMDGPU::RegisterPressureSets::AGPR_32)
return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index b55f5f2c418b09..8e481e3ac23043 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -331,8 +331,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
- bool RemoveReserved = true) const override;
+ unsigned getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const override;
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index d5a769b6c78c7c..a73bd1621a739d 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -936,12 +936,8 @@ bool RISCVRegisterInfo::getRegAllocationHints(
}
unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx,
- bool RemoveReserved) const {
+ unsigned Idx) const {
if (Idx == RISCV::RegisterPressureSets::GPRAll) {
- if (!RemoveReserved)
- return 32;
-
unsigned Reserved = 0;
BitVector ReservedRegs = getReservedRegs(MF);
for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
@@ -950,5 +946,5 @@ unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
return 32 - Reserved;
}
- return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx, RemoveReserved);
+ return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx);
}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 58f97394ec559b..ca4934de2f52d2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,8 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
static bool isRVVRegClass(const TargetRegisterClass *RC) {
return RISCVRI::isVRegClass(RC->TSFlags);
}
- unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
- bool RemoveReserved = true) const override;
+ unsigned getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const override;
};
} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 8e6a7add781c93..21e64ada7061aa 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -39,388 +39,384 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a2, a2, 1
; NOREMAT-NEXT: sub sp, sp, a2
; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
-; NOREMAT-NEXT: li a7, 32
-; NOREMAT-NEXT: addi a6, a0, 512
-; NOREMAT-NEXT: addi a4, a0, 1024
-; NOREMAT-NEXT: addi a5, a0, 1536
-; NOREMAT-NEXT: li t0, 1
+; NOREMAT-NEXT: mv a7, a0
+; NOREMAT-NEXT: li a0, 32
+; NOREMAT-NEXT: addi a5, a7, 512
+; NOREMAT-NEXT: addi a4, a7, 1024
+; NOREMAT-NEXT: addi a6, a7, 1536
+; NOREMAT-NEXT: li t1, 1
; NOREMAT-NEXT: li a3, 5
-; NOREMAT-NEXT: li t1, 3
+; NOREMAT-NEXT: li t0, 3
; NOREMAT-NEXT: li a2, 7
; NOREMAT-NEXT: lui t2, 1
-; NOREMAT-NEXT: li s4, 9
-; NOREMAT-NEXT: li s6, 11
-; NOREMAT-NEXT: li s9, 13
-; NOREMAT-NEXT: lui s7, 2
-; NOREMAT-NEXT: lui s1, 3
-; NOREMAT-NEXT: lui ra, 4
-; NOREMAT-NEXT: lui t3, 5
-; NOREMAT-NEXT: lui s0, 6
-; NOREMAT-NEXT: lui s3, 7
-; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma
-; NOREMAT-NEXT: slli t0, t0, 11
-; NOREMAT-NEXT: sd t0, 504(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli t5, a3, 9
-; NOREMAT-NEXT: slli t6, t1, 10
-; NOREMAT-NEXT: slli s2, a2, 9
-; NOREMAT-NEXT: add a7, a0, t2
-; NOREMAT-NEXT: lui s11, 1
-; NOREMAT-NEXT: slli s4, s4, 9
-; NOREMAT-NEXT: slli s5, a3, 10
-; NOREMAT-NEXT: vle32.v v8, (a6)
-; NOREMAT-NEXT: slli s6, s6, 9
-; NOREMAT-NEXT: slli s8, t1, 11
+; NOREMAT-NEXT: li s5, 9
+; NOREMAT-NEXT: li s8, 11
+; NOREMAT-NEXT: lui s1, 2
+; NOREMAT-NEXT: lui t5, 3
+; NOREMAT-NEXT: lui s11, 4
+; NOREMAT-NEXT: lui ra, 5
+; NOREMAT-NEXT: lui t3, 6
+; NOREMAT-NEXT: lui s0, 7
+; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOREMAT-NEXT: slli t4, t1, 11
+; NOREMAT-NEXT: slli t6, a3, 9
+; NOREMAT-NEXT: slli s2, t0, 10
+; NOREMAT-NEXT: slli s4, a2, 9
+; NOREMAT-NEXT: add a0, a7, t2
+; NOREMAT-NEXT: vle32.v v8, (a5)
+; NOREMAT-NEXT: slli s5, s5, 9
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: slli s9, s9, 9
-; NOREMAT-NEXT: vle32.v v0, (a5)
-; NOREMAT-NEXT: vle32.v v12, (a5)
-; NOREMAT-NEXT: slli s10, a2, 10
-; NOREMAT-NEXT: vle32.v v4, (a7)
-; NOREMAT-NEXT: vle32.v v20, (a7)
-; NOREMAT-NEXT: add a4, a0, s7
+; NOREMAT-NEXT: slli s6, a3, 10
+; NOREMAT-NEXT: vle32.v v0, (a6)
+; NOREMAT-NEXT: vle32.v v12, (a6)
+; NOREMAT-NEXT: slli s8, s8, 9
+; NOREMAT-NEXT: slli s9, t0, 11
+; NOREMAT-NEXT: vle32.v v4, (a0)
+; NOREMAT-NEXT: vle32.v v20, (a0)
+; NOREMAT-NEXT: add a4, a7, s1
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: add a4, a0, s1
+; NOREMAT-NEXT: add a4, a7, t5
; NOREMAT-NEXT: vle32.v v28, (a4)
; NOREMAT-NEXT: vle32.v v26, (a4)
-; NOREMAT-NEXT: add a4, a0, ra
+; NOREMAT-NEXT: add a4, a7, s11
; NOREMAT-NEXT: vle32.v v24, (a4)
; NOREMAT-NEXT: vle32.v v22, (a4)
-; NOREMAT-NEXT: add a4, a0, t3
-; NOREMAT-NEXT: vle32.v v14, (a0)
+; NOREMAT-NEXT: add a4, a7, ra
+; NOREMAT-NEXT: vle32.v v14, (a7)
; NOREMAT-NEXT: vle32.v v18, (a4)
; NOREMAT-NEXT: vle32.v v16, (a4)
-; NOREMAT-NEXT: add a4, a0, s0
+; NOREMAT-NEXT: add a4, a7, t3
; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8
; NOREMAT-NEXT: vle32.v v14, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: addi a4, sp, 640
-; NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill
-; NOREMAT-NEXT: add a4, a0, t0
+; NOREMAT-NEXT: addi a0, sp, 640
+; NOREMAT-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; NOREMAT-NEXT: add a4, a7, t4
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a0, t5
+; NOREMAT-NEXT: add a4, a7, t6
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a0, t6
+; NOREMAT-NEXT: add a4, a7, s2
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a0, s2
+; NOREMAT-NEXT: add a4, a7, s4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a0, s3
+; NOREMAT-NEXT: add a4, a7, s0
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a0, s4
+; NOREMAT-NEXT: add a4, a7, s5
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a0, s5
+; NOREMAT-NEXT: add a4, a7, s6
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a0, s6
+; NOREMAT-NEXT: add a4, a7, s8
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a0, s8
+; NOREMAT-NEXT: add a4, a7, s9
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a0, s9
+; NOREMAT-NEXT: li t5, 13
+; NOREMAT-NEXT: slli a4, t5, 9
+; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a0, s10
+; NOREMAT-NEXT: slli a4, a2, 10
+; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: li t2, 15
-; NOREMAT-NEXT: slli a4, t2, 9
-; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a0, a4
+; NOREMAT-NEXT: li a6, 15
+; NOREMAT-NEXT: slli a4, a6, 9
+; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v2, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT: lui t4, 8
-; NOREMAT-NEXT: add a5, a0, t4
+; NOREMAT-NEXT: lui t1, 8
+; NOREMAT-NEXT: add a5, a7, t1
; NOREMAT-NEXT: vle32.v v20, (a5)
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2
; NOREMAT-NEXT: li a4, 17
; NOREMAT-NEXT: slli a4, a4, 9
-; NOREMAT-NEXT: li s1, 17
-; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a0, a4
+; NOREMAT-NEXT: li t2, 17
+; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6
; NOREMAT-NEXT: li a5, 9
; NOREMAT-NEXT: slli a4, a5, 10
-; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a0, a4
+; NOREMAT-NEXT: sd a4, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: li a4, 19
; NOREMAT-NEXT: slli a4, a4, 9
-; NOREMAT-NEXT: li t1, 19
-; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a0, a4
+; NOREMAT-NEXT: li s1, 19
+; NOREMAT-NEXT: sd a4, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a4, a7, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
; NOREMAT-NEXT: slli a3, a3, 11
-; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT: add a3, a0, a3
+; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li s7, 21
; NOREMAT-NEXT: slli a3, s7, 9
-; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a0, a3
+; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: li a6, 11
-; NOREMAT-NEXT: slli a3, a6, 10
-; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a0, a3
+; NOREMAT-NEXT: li a4, 11
+; NOREMAT-NEXT: slli a3, a4, 10
+; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
; NOREMAT-NEXT: li s3, 23
-; NOREMAT-NEXT: slli a3, s3, 9
-; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a0, a3
+; NOREMAT-NEXT: slli s10, s3, 9
+; NOREMAT-NEXT: add a3, a7, s10
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
; NOREMAT-NEXT: li s0, 25
; NOREMAT-NEXT: slli a3, s0, 9
-; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a0, a3
+; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT: li a7, 13
-; NOREMAT-NEXT: slli a3, a7, 10
-; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a0, a3
+; NOREMAT-NEXT: slli a3, t5, 10
+; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28
; NOREMAT-NEXT: li t3, 27
; NOREMAT-NEXT: slli a3, t3, 9
-; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a0, a3
+; NOREMAT-NEXT: sd a3, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a3, a7, a3
; NOREMAT-NEXT: vle32.v v28, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: slli a2, a2, 11
-; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li t0, 29
; NOREMAT-NEXT: slli a2, t0, 9
-; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT: slli a2, t2, 10
-; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: li t2, 15
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: slli a2, a6, 10
+; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
; NOREMAT-NEXT: li a3, 31
-; NOREMAT-NEXT: slli a2, a3, 9
-; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
-; NOREMAT-NEXT: vle32.v v12, (a2)
-; NOREMAT-NEXT: vle32.v v4, (a2)
+; NOREMAT-NEXT: slli a0, a3, 9
+; NOREMAT-NEXT: sd a0, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a7, a0
+; NOREMAT-NEXT: vle32.v v12, (a0)
+; NOREMAT-NEXT: vle32.v v4, (a0)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT: addiw a2, ra, 512
+; NOREMAT-NEXT: addiw a2, s11, 512
; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28
-; NOREMAT-NEXT: slli a2, s1, 10
+; NOREMAT-NEXT: slli a2, t2, 10
; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: addiw a2, ra, 1536
+; NOREMAT-NEXT: addiw a2, s11, 1536
; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, a5, 11
; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8
-; NOREMAT-NEXT: lui a4, 5
-; NOREMAT-NEXT: addiw a2, a4, -1536
+; NOREMAT-NEXT: addiw a2, ra, -1536
; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28
-; NOREMAT-NEXT: slli a2, t1, 10
+; NOREMAT-NEXT: slli a2, s1, 10
; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: li t1, 19
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT: addiw a2, a4, -512
+; NOREMAT-NEXT: addiw a2, ra, -512
; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24
-; NOREMAT-NEXT: addiw a2, a4, 512
+; NOREMAT-NEXT: addiw a2, ra, 512
; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, s7, 10
; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26
-; NOREMAT-NEXT: addiw a2, a4, 1536
+; NOREMAT-NEXT: addiw a2, ra, 1536
; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
-; NOREMAT-NEXT: slli a2, a6, 11
+; NOREMAT-NEXT: slli a2, a4, 11
; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18
-; NOREMAT-NEXT: lui a5, 6
-; NOREMAT-NEXT: addiw a2, a5, -1536
+; NOREMAT-NEXT: lui a4, 6
+; NOREMAT-NEXT: addiw a2, a4, -1536
; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: slli a2, s3, 10
; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT: addiw a2, a5, -512
+; NOREMAT-NEXT: addiw a2, a4, -512
; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT: addiw a2, a5, 512
+; NOREMAT-NEXT: addiw a2, a4, 512
; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, s0, 10
; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18
-; NOREMAT-NEXT: addiw a2, a5, 1536
+; NOREMAT-NEXT: addiw a2, a4, 1536
; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
-; NOREMAT-NEXT: slli a2, a7, 11
+; NOREMAT-NEXT: slli a2, t5, 11
; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8
-; NOREMAT-NEXT: lui a7, 7
-; NOREMAT-NEXT: addiw a2, a7, -1536
+; NOREMAT-NEXT: lui a5, 7
+; NOREMAT-NEXT: addiw a2, a5, -1536
; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, t3, 10
; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addi a2, sp, 640
-; NOREMAT-NEXT: vl2r.v v12, (a2) # Unknown-size Folded Reload
+; NOREMAT-NEXT: addi a0, sp, 640
+; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22
-; NOREMAT-NEXT: addiw a2, a7, -512
+; NOREMAT-NEXT: addiw a2, a5, -512
; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26
-; NOREMAT-NEXT: addiw a2, a7, 512
+; NOREMAT-NEXT: addiw a2, a5, 512
; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, t0, 10
; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16
-; NOREMAT-NEXT: addiw a2, a7, 1536
+; NOREMAT-NEXT: addiw a2, a5, 1536
; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
-; NOREMAT-NEXT: slli a2, t2, 11
+; NOREMAT-NEXT: slli a2, a6, 11
; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14
-; NOREMAT-NEXT: addiw a2, t4, -1536
+; NOREMAT-NEXT: addiw a2, t1, -1536
; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, a3, 10
; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22
-; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: add a2, a7, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addiw a2, t4, -512
-; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a0, a2
+; NOREMAT-NEXT: addiw a0, t1, -512
+; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a7, a0
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0
; NOREMAT-NEXT: vle32.v v12, (a0)
; NOREMAT-NEXT: vle32.v v0, (a0)
@@ -435,32 +431,33 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; NOREMAT-NEXT: addi a0, a1, 1024
; NOREMAT-NEXT: vse32.v v8, (a0)
-; NOREMAT-NEXT: add s11, a1, s11
-; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: lui a0, 1
+; NOREMAT-NEXT: add a0, a1, a0
+; NOREMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 2
; NOREMAT-NEXT: add a0, a1, a0
; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 3
; NOREMAT-NEXT: add a0, a1, a0
; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add s11, a1, s11
+; NOREMAT-NEXT: sd s11, 248(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add ra, a1, ra
-; NOREMAT-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd ra, 240(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a4, a1, a4
-; NOREMAT-NEXT: sd a4, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: sd a4, 232(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: add a5, a1, a5
-; NOREMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a7, a1, a7
-; NOREMAT-NEXT: sd a7, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a1, t4
+; NOREMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a1, t1
; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t4, 512
+; NOREMAT-NEXT: addiw a0, t1, 512
; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t4, 1024
+; NOREMAT-NEXT: addiw a0, t1, 1024
; NOREMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: addiw a0, t4, 1536
+; NOREMAT-NEXT: addiw a0, t1, 1536
; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli s1, s1, 11
-; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli t2, t2, 11
+; NOREMAT-NEXT: sd t2, 128(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui a0, 9
; NOREMAT-NEXT: addiw a2, a0, -1536
; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
@@ -473,7 +470,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: addiw s11, a0, 512
; NOREMAT-NEXT: addiw s7, a0, 1024
; NOREMAT-NEXT: addiw s3, a0, 1536
-; NOREMAT-NEXT: slli s1, t1, 11
+; NOREMAT-NEXT: slli s1, s1, 11
; NOREMAT-NEXT: lui a0, 10
; NOREMAT-NEXT: addiw t2, a0, -1536
; NOREMAT-NEXT: addiw a7, a0, -1024
@@ -481,52 +478,52 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: add a2, a1, a0
; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: addiw a0, a0, 512
-; NOREMAT-NEXT: ld a2, 504(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT: add a2, a1, a2
-; NOREMAT-NEXT: add a3, a1, t5
-; NOREMAT-NEXT: add a5, a1, t6
-; NOREMAT-NEXT: add a6, a1, s2
-; NOREMAT-NEXT: add t0, a1, s4
-; NOREMAT-NEXT: add t1, a1, s5
-; NOREMAT-NEXT: add t3, a1, s6
-; NOREMAT-NEXT: add t4, a1, s8
-; NOREMAT-NEXT: add t5, a1, s9
-; NOREMAT-NEXT: add t6, a1, s10
-; NOREMAT-NEXT: ld s0, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add a2, a1, t4
+; NOREMAT-NEXT: add a3, a1, t6
+; NOREMAT-NEXT: add a5, a1, s2
+; NOREMAT-NEXT: add a6, a1, s4
+; NOREMAT-NEXT: add t0, a1, s5
+; NOREMAT-NEXT: add t1, a1, s6
+; NOREMAT-NEXT: add t3, a1, s8
+; NOREMAT-NEXT: add t4, a1, s9
+; NOREMAT-NEXT: ld t5, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add t5, a1, t5
+; NOREMAT-NEXT: ld t6, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: add t6, a1, t6
+; NOREMAT-NEXT: ld s0, 608(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s0, a1, s0
-; NOREMAT-NEXT: ld s2, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s2, 600(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s2, a1, s2
-; NOREMAT-NEXT: ld s4, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s4, 592(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s4, a1, s4
-; NOREMAT-NEXT: ld s5, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s5, 584(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s5, a1, s5
-; NOREMAT-NEXT: ld s6, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s6, 576(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s6, a1, s6
-; NOREMAT-NEXT: ld s8, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s8, 568(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s8, a1, s8
-; NOREMAT-NEXT: ld s9, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld s9, 560(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s9, a1, s9
-; NOREMAT-NEXT: ld s10, 568(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add s10, a1, s10
-; NOREMAT-NEXT: ld ra, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT: ld ra, 504(sp) # 8-byte Folded Reload
; NOREMAT-NEXT: add ra, a1, ra
; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload
@@ -920,10 +917,9 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: .cfi_offset s10, -96
; REMAT-NEXT: .cfi_offset s11, -104
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 14
-; REMAT-NEXT: mul a2, a2, a3
+; REMAT-NEXT: slli a2, a2, 3
; REMAT-NEXT: sub sp, sp, a2
-; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb
+; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb
; REMAT-NEXT: li a4, 32
; REMAT-NEXT: addi a5, a0, 512
; REMAT-NEXT: addi a3, a0, 1024
@@ -960,20 +956,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli s6, s6, 9
; REMAT-NEXT: li s7, 5
; REMAT-NEXT: slli s7, s7, 11
-; REMAT-NEXT: li s8, 21
-; REMAT-NEXT: slli s8, s8, 9
-; REMAT-NEXT: li s9, 11
-; REMAT-NEXT: slli s9, s9, 10
-; REMAT-NEXT: li s10, 23
-; REMAT-NEXT: slli s10, s10, 9
-; REMAT-NEXT: lui s11, 3
; REMAT-NEXT: vsetvli zero, a4, e32, m2, ta, ma
; REMAT-NEXT: vle32.v v8, (a5)
-; REMAT-NEXT: li a4, 25
+; REMAT-NEXT: li a4, 21
; REMAT-NEXT: slli a4, a4, 9
; REMAT-NEXT: vle32.v v10, (a3)
; REMAT-NEXT: vle32.v v12, (a3)
-; REMAT-NEXT: li a3, 13
+; REMAT-NEXT: li a3, 11
; REMAT-NEXT: slli a3, a3, 10
; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: vle32.v v16, (a2)
@@ -990,7 +979,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a5, 12
+; REMAT-NEXT: li a5, 6
; REMAT-NEXT: mul a2, a2, a5
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
@@ -1000,8 +989,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a5, 10
-; REMAT-NEXT: mul a2, a2, a5
+; REMAT-NEXT: slli a2, a2, 2
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1015,16 +1003,11 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: add a2, a0, t5
-; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: add a2, a0, t6
+; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18
; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: add a2, a0, t6
+; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: add a2, a0, s0
@@ -1034,383 +1017,340 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: add a2, a0, s1
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30
-; REMAT-NEXT: vle32.v v28, (a2)
+; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: add a2, a0, s2
-; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: vle32.v v12, (a2)
; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: li a6, 12
+; REMAT-NEXT: li a6, 6
; REMAT-NEXT: mul a5, a5, a6
; REMAT-NEXT: add a5, sp, a5
; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT: vl2r.v v28, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v28, v2
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: add a2, a0, s3
-; REMAT-NEXT: vle32.v v12, (a2)
+; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: li a6, 10
-; REMAT-NEXT: mul a5, a5, a6
+; REMAT-NEXT: slli a5, a5, 2
; REMAT-NEXT: add a5, sp, a5
; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
-; REMAT-NEXT: vle32.v v30, (a2)
+; REMAT-NEXT: vl2r.v v30, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT: sf.vc.vv 3, 0, v30, v4
+; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: add a2, a0, s4
-; REMAT-NEXT: vle32.v v16, (a2)
+; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT: vle32.v v6, (a2)
-; REMAT-NEXT: add a2, a0, s5
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14
-; REMAT-NEXT: vle32.v v4, (a2)
-; REMAT-NEXT: add a2, a0, s6
-; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: csrr a5, vlenb
-; REMAT-NEXT: slli a5, a5, 3
-; REMAT-NEXT: add a5, sp, a5
-; REMAT-NEXT: addi a5, a5, 432
-; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT: add a2, a0, s5
+; REMAT-NEXT: vle32.v v6, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8
; REMAT-NEXT: vle32.v v0, (a2)
-; REMAT-NEXT: add a2, a0, s7
+; REMAT-NEXT: add a2, a0, s6
+; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16
; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: add a2, a0, s7
+; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22
-; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: add a2, a0, s8
+; REMAT-NEXT: vle32.v v22, (a2)
+; REMAT-NEXT: add a2, a0, a4
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26
; REMAT-NEXT: vle32.v v24, (a2)
-; REMAT-NEXT: add a2, a0, s9
-; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8
-; REMAT-NEXT: vle32.v v26, (a2)
-; REMAT-NEXT: add a2, a0, s10
-; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12
-; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: add a2, a0, s11
-; REMAT-NEXT: vle32.v v2, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16
-; REMAT-NEXT: vle32.v v16, (a2)
; REMAT-NEXT: addi a2, sp, 432
-; REMAT-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: add a2, a0, a4
-; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 1
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: vs2r.v v24, (a2) # Unknown-size Folded Spill
; REMAT-NEXT: add a2, a0, a3
-; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14
-; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 12
-; REMAT-NEXT: mul a2, a2, a3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a5, 27
+; REMAT-NEXT: vle32.v v24, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v14, v12
+; REMAT-NEXT: vle32.v v12, (a2)
+; REMAT-NEXT: li a5, 23
; REMAT-NEXT: slli a5, a5, 9
; REMAT-NEXT: add a2, a0, a5
+; REMAT-NEXT: vle32.v v26, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v2, v28
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18
-; REMAT-NEXT: vle32.v v18, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 10
+; REMAT-NEXT: li a3, 6
; REMAT-NEXT: mul a2, a2, a3
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li ra, 7
-; REMAT-NEXT: slli ra, ra, 11
-; REMAT-NEXT: add a2, a0, ra
+; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: lui s8, 3
+; REMAT-NEXT: add a2, a0, s8
; REMAT-NEXT: vle32.v v28, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 3
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v18, v20
-; REMAT-NEXT: vle32.v v18, (a2)
+; REMAT-NEXT: sf.vc.vv 3, 0, v4, v30
+; REMAT-NEXT: vle32.v v14, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 3
+; REMAT-NEXT: slli a2, a2, 2
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a2, 29
-; REMAT-NEXT: slli a2, a2, 9
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT: li s9, 25
+; REMAT-NEXT: slli s9, s9, 9
+; REMAT-NEXT: add a2, a0, s9
; REMAT-NEXT: vle32.v v30, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v24, v22
-; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: li a3, 6
-; REMAT-NEXT: mul a2, a2, a3
-; REMAT-NEXT: add a2, sp, a2
-; REMAT-NEXT: addi a2, a2, 432
-; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a2, 15
-; REMAT-NEXT: slli a2, a2, 10
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: sf.vc.vv 3, 0, v10, v6
+; REMAT-NEXT: vle32.v v14, (a2)
+; REMAT-NEXT: li s10, 13
+; REMAT-NEXT: slli s10, s10, 10
+; REMAT-NEXT: add a2, a0, s10
; REMAT-NEXT: vle32.v v6, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v26, v8
+; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8
; REMAT-NEXT: vle32.v v8, (a2)
; REMAT-NEXT: csrr a2, vlenb
-; REMAT-NEXT: slli a2, a2, 2
+; REMAT-NEXT: slli a2, a2, 1
; REMAT-NEXT: add a2, sp, a2
; REMAT-NEXT: addi a2, a2, 432
; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT: li a2, 31
-; REMAT-NEXT: slli a2, a2, 9
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: li s11, 27
+; REMAT-NEXT: slli s11, s11, 9
+; REMAT-NEXT: add a2, a0, s11
; REMAT-NEXT: vle32.v v4, (a2)
-; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: li ra, 7
+; REMAT-NEXT: slli ra, ra, 11
+; REMAT-NEXT: add a2, a0, ra
; REMAT-NEXT: vle32.v v2, (a2)
-; REMAT-NEXT: addi a3, sp, 432
-; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v16
+; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: addiw a2, a2, 512
+; REMAT-NEXT: li a2, 29
+; REMAT-NEXT: slli a2, a2, 9
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 1
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
+; REMAT-NEXT: addi a3, sp, 432
; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li a2, 17
+; REMAT-NEXT: li a2, 15
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 12
-; REMAT-NEXT: mul a3, a3, a4
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT: sf.vc.vv 3, 0, v12, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui a2, 4
-; REMAT-NEXT: addiw a2, a2, 1536
+; REMAT-NEXT: li a2, 31
+; REMAT-NEXT: slli a2, a2, 9
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 10
+; REMAT-NEXT: li a4, 6
; REMAT-NEXT: mul a3, a3, a4
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: li a2, 9
-; REMAT-NEXT: slli a2, a2, 11
+; REMAT-NEXT: lui a2, 4
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 3
+; REMAT-NEXT: slli a3, a3, 2
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: addiw a2, a2, 512
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v30, (a2)
-; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: li a4, 6
-; REMAT-NEXT: mul a3, a3, a4
-; REMAT-NEXT: add a3, sp, a3
-; REMAT-NEXT: addi a3, a3, 432
-; REMAT-NEXT: vl2r.v v14, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li a2, 19
+; REMAT-NEXT: li a2, 17
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: csrr a3, vlenb
-; REMAT-NEXT: slli a3, a3, 2
+; REMAT-NEXT: slli a3, a3, 1
; REMAT-NEXT: add a3, sp, a3
; REMAT-NEXT: addi a3, a3, 432
; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: lui a2, 4
+; REMAT-NEXT: addiw a2, a2, 1536
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: li a2, 9
+; REMAT-NEXT: slli a2, a2, 11
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
; REMAT-NEXT: lui a2, 5
-; REMAT-NEXT: addiw a2, a2, 512
+; REMAT-NEXT: addiw a2, a2, -1536
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li a2, 21
+; REMAT-NEXT: li a2, 19
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui s4, 5
-; REMAT-NEXT: addiw s4, s4, 1536
-; REMAT-NEXT: add a2, a0, s4
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: li a2, 11
-; REMAT-NEXT: slli a2, a2, 11
+; REMAT-NEXT: lui a2, 5
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui s3, 6
-; REMAT-NEXT: addiw s3, s3, -1536
-; REMAT-NEXT: add a2, a0, s3
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, 512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li s2, 23
-; REMAT-NEXT: slli s2, s2, 10
-; REMAT-NEXT: add a2, a0, s2
+; REMAT-NEXT: li a2, 21
+; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 6
-; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: lui a2, 5
+; REMAT-NEXT: addiw a2, a2, 1536
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: li a2, 11
+; REMAT-NEXT: slli a2, a2, 11
; REMAT-NEXT: add a2, a0, a2
-; REMAT-NEXT: lui s1, 6
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: lui s0, 6
-; REMAT-NEXT: addiw s0, s0, 512
-; REMAT-NEXT: add a2, a0, s0
+; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li a2, 25
+; REMAT-NEXT: li a2, 23
; REMAT-NEXT: slli a2, a2, 10
; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui t6, 6
-; REMAT-NEXT: addiw t6, t6, 1536
-; REMAT-NEXT: add a2, a0, t6
+; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: li t5, 13
-; REMAT-NEXT: slli t5, t5, 11
-; REMAT-NEXT: add a2, a0, t5
+; REMAT-NEXT: lui a2, 6
+; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui s1, 6
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: addiw a2, a2, -1536
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui s0, 6
+; REMAT-NEXT: addiw s0, s0, 512
+; REMAT-NEXT: add a2, a0, s0
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li t4, 27
-; REMAT-NEXT: slli t4, t4, 10
-; REMAT-NEXT: add a2, a0, t4
+; REMAT-NEXT: li a2, 25
+; REMAT-NEXT: slli a2, a2, 10
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: addiw a2, a2, -512
-; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui t6, 6
+; REMAT-NEXT: addiw t6, t6, 1536
+; REMAT-NEXT: add a2, a0, t6
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui a2, 7
-; REMAT-NEXT: add a2, a0, a2
-; REMAT-NEXT: lui t3, 7
+; REMAT-NEXT: li t5, 13
+; REMAT-NEXT: slli t5, t5, 11
+; REMAT-NEXT: add a2, a0, t5
; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
; REMAT-NEXT: vle32.v v20, (a2)
-; REMAT-NEXT: lui t2, 7
-; REMAT-NEXT: addiw t2, t2, 512
-; REMAT-NEXT: add a2, a0, t2
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: addiw a2, a2, -1536
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
; REMAT-NEXT: vle32.v v22, (a2)
-; REMAT-NEXT: li t1, 29
-; REMAT-NEXT: slli t1, t1, 10
-; REMAT-NEXT: add a2, a0, t1
+; REMAT-NEXT: li t4, 27
+; REMAT-NEXT: slli t4, t4, 10
+; REMAT-NEXT: add a2, a0, t4
; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
; REMAT-NEXT: vle32.v v8, (a2)
-; REMAT-NEXT: lui t0, 7
-; REMAT-NEXT: addiw t0, t0, 1536
-; REMAT-NEXT: add a2, a0, t0
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: addiw a2, a2, -512
+; REMAT-NEXT: add a2, a0, a2
; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: vle32.v v10, (a2)
-; REMAT-NEXT: li a7, 15
-; REMAT-NEXT: slli a7, a7, 11
-; REMAT-NEXT: add a2, a0, a7
+; REMAT-NEXT: lui a2, 7
+; REMAT-NEXT: add a2, a0, a2
+; REMAT-NEXT: lui t3, 7
; REMAT-NEXT: vle32.v v28, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: vle32.v v12, (a2)
-; REMAT-NEXT: lui a6, 8
-; REMAT-NEXT: addiw a6, a6, -1536
-; REMAT-NEXT: add a2, a0, a6
+; REMAT-NEXT: lui t2, 7
+; REMAT-NEXT: addiw t2, t2, 512
+; REMAT-NEXT: add a2, a0, t2
; REMAT-NEXT: vle32.v v30, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: vle32.v v14, (a2)
-; REMAT-NEXT: li a4, 31
-; REMAT-NEXT: slli a4, a4, 10
-; REMAT-NEXT: add a2, a0, a4
+; REMAT-NEXT: li t1, 29
+; REMAT-NEXT: slli t1, t1, 10
+; REMAT-NEXT: add a2, a0, t1
; REMAT-NEXT: vle32.v v6, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: vle32.v v16, (a2)
-; REMAT-NEXT: lui a3, 8
-; REMAT-NEXT: addiw a3, a3, -512
-; REMAT-NEXT: add a2, a0, a3
+; REMAT-NEXT: lui t0, 7
+; REMAT-NEXT: addiw t0, t0, 1536
+; REMAT-NEXT: add a2, a0, t0
; REMAT-NEXT: vle32.v v4, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
; REMAT-NEXT: vle32.v v18, (a2)
-; REMAT-NEXT: lui a2, 8
-; REMAT-NEXT: add a0, a0, a2
-; REMAT-NEXT: vle32.v v2, (a0)
+; REMAT-NEXT: li a7, 15
+; REMAT-NEXT: slli a7, a7, 11
+; REMAT-NEXT: add a2, a0, a7
+; REMAT-NEXT: vle32.v v2, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT: vle32.v v20, (a2)
+; REMAT-NEXT: lui a6, 8
+; REMAT-NEXT: addiw a6, a6, -1536
+; REMAT-NEXT: add a2, a0, a6
+; REMAT-NEXT: vle32.v v0, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT: vle32.v v22, (a2)
+; REMAT-NEXT: li a4, 31
+; REMAT-NEXT: slli a4, a4, 10
+; REMAT-NEXT: add a2, a0, a4
+; REMAT-NEXT: vle32.v v24, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT: vle32.v v8, (a2)
+; REMAT-NEXT: lui a3, 8
+; REMAT-NEXT: addiw a3, a3, -512
+; REMAT-NEXT: add a2, a0, a3
+; REMAT-NEXT: vle32.v v26, (a2)
; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT: vle32.v v10, (a2)
+; REMAT-NEXT: lui a2, 8
+; REMAT-NEXT: add a0, a0, a2
+; REMAT-NEXT: vle32.v v28, (a0)
; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30
; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6
; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4
; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: addi a0, a1, 1024
; REMAT-NEXT: vse32.v v8, (a0)
@@ -1457,41 +1397,36 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 336(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 15
-; REMAT-NEXT: slli a0, a0, 9
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 328(sp) # 8-byte Folded Spill
-; REMAT-NEXT: lui a0, 2
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 17
-; REMAT-NEXT: slli a0, a0, 9
-; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s2, a1, s2
+; REMAT-NEXT: sd s2, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s3, a1, s3
+; REMAT-NEXT: sd s3, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s4, a1, s4
+; REMAT-NEXT: sd s4, 312(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s5, a1, s5
; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s6, a1, s6
; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill
; REMAT-NEXT: add s7, a1, s7
; REMAT-NEXT: sd s7, 288(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s8, a1, s8
-; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s9, a1, s9
-; REMAT-NEXT: sd s9, 272(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s10, a1, s10
-; REMAT-NEXT: sd s10, 264(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s11, a1, s11
-; REMAT-NEXT: sd s11, 256(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 25
+; REMAT-NEXT: li a0, 21
; REMAT-NEXT: slli a0, a0, 9
; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 248(sp) # 8-byte Folded Spill
-; REMAT-NEXT: li a0, 13
+; REMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 11
; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd a0, 272(sp) # 8-byte Folded Spill
; REMAT-NEXT: add a5, a1, a5
-; REMAT-NEXT: sd a5, 232(sp) # 8-byte Folded Spill
+; REMAT-NEXT: sd a5, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s8, a1, s8
+; REMAT-NEXT: sd s8, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s9, a1, s9
+; REMAT-NEXT: sd s9, 248(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s10, a1, s10
+; REMAT-NEXT: sd s10, 240(sp) # 8-byte Folded Spill
+; REMAT-NEXT: add s11, a1, s11
+; REMAT-NEXT: sd s11, 232(sp) # 8-byte Folded Spill
; REMAT-NEXT: add ra, a1, ra
; REMAT-NEXT: sd ra, 224(sp) # 8-byte Folded Spill
; REMAT-NEXT: li a0, 29
@@ -1548,16 +1483,22 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: slli a0, a0, 10
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 112(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s4, a1, s4
-; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT: lui a0, 5
+; REMAT-NEXT: addiw a0, a0, 1536
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 104(sp) # 8-byte Folded Spill
; REMAT-NEXT: li a0, 11
; REMAT-NEXT: slli a0, a0, 11
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sd a0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s3, a1, s3
-; REMAT-NEXT: sd s3, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT: add s2, a1, s2
-; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT: lui a0, 6
+; REMAT-NEXT: addiw a0, a0, -1536
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT: li a0, 23
+; REMAT-NEXT: slli a0, a0, 10
+; REMAT-NEXT: add a0, a1, a0
+; REMAT-NEXT: sd a0, 80(sp) # 8-byte Folded Spill
; REMAT-NEXT: lui a0, 6
; REMAT-NEXT: addiw a0, a0, -512
; REMAT-NEXT: add a0, a1, a0
@@ -1854,8 +1795,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: csrr a0, vlenb
-; REMAT-NEXT: li a1, 14
-; REMAT-NEXT: mul a0, a0, a1
+; REMAT-NEXT: slli a0, a0, 3
; REMAT-NEXT: add sp, sp, a0
; REMAT-NEXT: .cfi_def_cfa sp, 544
; REMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 575a757149ebba..0b5856a7000dd4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -5682,28 +5682,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
;
; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: addi sp, sp, -48
-; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48
-; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: addi sp, sp, -16
+; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16
+; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s3, 0(sp) # 4-byte Folded Spill
; RV32ZVE32F-NEXT: .cfi_offset s0, -4
; RV32ZVE32F-NEXT: .cfi_offset s1, -8
; RV32ZVE32F-NEXT: .cfi_offset s2, -12
; RV32ZVE32F-NEXT: .cfi_offset s3, -16
-; RV32ZVE32F-NEXT: .cfi_offset s4, -20
-; RV32ZVE32F-NEXT: .cfi_offset s5, -24
-; RV32ZVE32F-NEXT: .cfi_offset s6, -28
-; RV32ZVE32F-NEXT: .cfi_offset s7, -32
-; RV32ZVE32F-NEXT: .cfi_offset s8, -36
-; RV32ZVE32F-NEXT: .cfi_offset s9, -40
; RV32ZVE32F-NEXT: .cfi_remember_state
; RV32ZVE32F-NEXT: lw a3, 56(a0)
; RV32ZVE32F-NEXT: lw a4, 60(a0)
@@ -5715,30 +5703,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
; RV32ZVE32F-NEXT: lw t4, 28(a0)
; RV32ZVE32F-NEXT: lw t1, 32(a0)
; RV32ZVE32F-NEXT: lw t2, 36(a0)
+; RV32ZVE32F-NEXT: lw t5, 0(a2)
+; RV32ZVE32F-NEXT: lw t6, 8(a2)
+; RV32ZVE32F-NEXT: lw s0, 16(a2)
+; RV32ZVE32F-NEXT: lw s1, 24(a2)
+; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.v.x v8, t5
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6
+; RV32ZVE32F-NEXT: lw t5, 32(a2)
+; RV32ZVE32F-NEXT: lw t6, 40(a2)
+; RV32ZVE32F-NEXT: lw s2, 48(a2)
+; RV32ZVE32F-NEXT: lw s3, 56(a2)
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s0
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s1
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t5
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t6
; RV32ZVE32F-NEXT: lw s0, 8(a0)
; RV32ZVE32F-NEXT: lw s1, 12(a0)
; RV32ZVE32F-NEXT: lw t5, 16(a0)
; RV32ZVE32F-NEXT: lw t6, 20(a0)
-; RV32ZVE32F-NEXT: lw s2, 32(a2)
-; RV32ZVE32F-NEXT: lw s3, 40(a2)
-; RV32ZVE32F-NEXT: lw s4, 48(a2)
-; RV32ZVE32F-NEXT: lw s5, 56(a2)
-; RV32ZVE32F-NEXT: lw s6, 0(a2)
-; RV32ZVE32F-NEXT: lw s7, 8(a2)
-; RV32ZVE32F-NEXT: lw s8, 16(a2)
-; RV32ZVE32F-NEXT: lw s9, 24(a2)
-; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vmv.v.x v8, s6
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; RV32ZVE32F-NEXT: vmv.x.s a2, v0
; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s9
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5
; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3
; RV32ZVE32F-NEXT: andi s2, a2, 1
; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
@@ -5771,27 +5759,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
; RV32ZVE32F-NEXT: sw a3, 0(a0)
; RV32ZVE32F-NEXT: sw a4, 4(a0)
; RV32ZVE32F-NEXT: .LBB51_9: # %else14
-; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s3, 0(sp) # 4-byte Folded Reload
; RV32ZVE32F-NEXT: .cfi_restore s0
; RV32ZVE32F-NEXT: .cfi_restore s1
; RV32ZVE32F-NEXT: .cfi_restore s2
; RV32ZVE32F-NEXT: .cfi_restore s3
-; RV32ZVE32F-NEXT: .cfi_restore s4
-; RV32ZVE32F-NEXT: .cfi_restore s5
-; RV32ZVE32F-NEXT: .cfi_restore s6
-; RV32ZVE32F-NEXT: .cfi_restore s7
-; RV32ZVE32F-NEXT: .cfi_restore s8
-; RV32ZVE32F-NEXT: .cfi_restore s9
-; RV32ZVE32F-NEXT: addi sp, sp, 48
+; RV32ZVE32F-NEXT: addi sp, sp, 16
; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 0
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index a11c02dd5e2cb4..036fee6a13ca4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1306,6 +1306,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 219(sp)
; ZVFHMIN32-NEXT: lh a0, 564(sp)
; ZVFHMIN32-NEXT: lh a1, 308(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 218(sp)
+; ZVFHMIN32-NEXT: lh a0, 562(sp)
+; ZVFHMIN32-NEXT: lh a1, 306(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7
; ZVFHMIN32-NEXT: csrr a2, vlenb
@@ -1358,86 +1364,82 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13
-; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 1
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: addi a2, a2, 848
+; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 13
+; ZVFHMIN32-NEXT: addi a2, sp, 848
; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11
-; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10
-; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9
-; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8
-; ZVFHMIN32-NEXT: vmv.x.s t5, v16
+; ZVFHMIN32-NEXT: vslidedown.vi v6, v8, 12
+; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 11
+; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 10
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 9
+; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 8
+; ZVFHMIN32-NEXT: vmv.x.s a3, v16
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 218(sp)
-; ZVFHMIN32-NEXT: lh a0, 562(sp)
-; ZVFHMIN32-NEXT: lh a1, 306(sp)
+; ZVFHMIN32-NEXT: sb a0, 217(sp)
+; ZVFHMIN32-NEXT: lh a0, 560(sp)
+; ZVFHMIN32-NEXT: lh a1, 304(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7
-; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6
-; ZVFHMIN32-NEXT: vslidedown.vi v5, v16, 5
+; ZVFHMIN32-NEXT: vslidedown.vi v21, v16, 7
+; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 6
+; ZVFHMIN32-NEXT: vslidedown.vi v19, v16, 5
; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 18
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 10
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 22
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a2, a2, 4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 21
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a4, a2, 4
+; ZVFHMIN32-NEXT: sub a2, a4, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15
-; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14
-; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 13
-; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 12
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 11
-; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10
+; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 15
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 13
+; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12
+; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 11
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 19
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a4, a2, 4
+; ZVFHMIN32-NEXT: add a2, a4, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9
+; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 10
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 14
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 11
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8
+; ZVFHMIN32-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT: vslidedown.vi v4, v16, 9
+; ZVFHMIN32-NEXT: vslidedown.vi v30, v16, 8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 217(sp)
-; ZVFHMIN32-NEXT: lh a0, 560(sp)
-; ZVFHMIN32-NEXT: lh a1, 304(sp)
+; ZVFHMIN32-NEXT: sb a0, 216(sp)
+; ZVFHMIN32-NEXT: lh a0, 558(sp)
+; ZVFHMIN32-NEXT: lh a1, 302(sp)
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 7
-; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 6
-; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 5
+; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 7
+; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 6
+; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 5
; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 4
-; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 3
-; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 2
-; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 1
+; ZVFHMIN32-NEXT: vslidedown.vi v31, v0, 3
+; ZVFHMIN32-NEXT: vslidedown.vi v5, v0, 2
+; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 1
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 15
; ZVFHMIN32-NEXT: csrr a2, vlenb
@@ -1447,99 +1449,88 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 3
+; ZVFHMIN32-NEXT: slli a2, a2, 1
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 6
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 6
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 12
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a2, a2, 3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 10
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: li a4, 13
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 4
+; ZVFHMIN32-NEXT: li a4, 19
+; ZVFHMIN32-NEXT: mul a2, a2, a4
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a4, 21
+; ZVFHMIN32-NEXT: mul a2, a2, a4
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: addi a2, a2, 848
+; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8
-; ZVFHMIN32-NEXT: addi a2, sp, 848
-; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT: vmv.x.s t4, v26
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 216(sp)
-; ZVFHMIN32-NEXT: lh a0, 558(sp)
-; ZVFHMIN32-NEXT: lh a1, 302(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t3, v20
-; ZVFHMIN32-NEXT: vmv.x.s t1, v28
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 215(sp)
; ZVFHMIN32-NEXT: lh a0, 556(sp)
; ZVFHMIN32-NEXT: lh a1, 300(sp)
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 1
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t2, v0
-; ZVFHMIN32-NEXT: vmv.x.s t0, v4
+; ZVFHMIN32-NEXT: vmv.x.s t3, v26
+; ZVFHMIN32-NEXT: vmv.x.s t2, v28
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 214(sp)
; ZVFHMIN32-NEXT: lh a0, 554(sp)
; ZVFHMIN32-NEXT: lh a1, 298(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a7, v2
-; ZVFHMIN32-NEXT: vmv.x.s a6, v30
+; ZVFHMIN32-NEXT: addi a2, sp, 848
+; ZVFHMIN32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t1, v16
+; ZVFHMIN32-NEXT: vmv.x.s t0, v6
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 213(sp)
; ZVFHMIN32-NEXT: lh a0, 552(sp)
; ZVFHMIN32-NEXT: lh a1, 296(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v22
-; ZVFHMIN32-NEXT: sw a2, 104(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: vmv.x.s a2, v18
-; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a7, v2
+; ZVFHMIN32-NEXT: vmv.x.s a6, v22
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 212(sp)
; ZVFHMIN32-NEXT: lh a0, 550(sp)
; ZVFHMIN32-NEXT: lh a1, 294(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v14
-; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT: vmv.x.s a2, v12
-; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a5, v20
+; ZVFHMIN32-NEXT: vmv.x.s a2, v18
+; ZVFHMIN32-NEXT: sw a2, 108(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 211(sp)
; ZVFHMIN32-NEXT: lh a0, 548(sp)
; ZVFHMIN32-NEXT: lh a1, 292(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v10
-; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v14
+; ZVFHMIN32-NEXT: sw a2, 116(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: vmv.x.s a2, v8
; ZVFHMIN32-NEXT: sw a2, 124(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
@@ -1548,33 +1539,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 210(sp)
; ZVFHMIN32-NEXT: lh a0, 546(sp)
; ZVFHMIN32-NEXT: lh a1, 290(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT: vmv.x.s t5, v24
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: vmv.x.s a3, v24
; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN32-NEXT: sb a0, 209(sp)
; ZVFHMIN32-NEXT: lh a0, 544(sp)
; ZVFHMIN32-NEXT: lh a1, 288(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb t5, 192(sp)
+; ZVFHMIN32-NEXT: sb a3, 192(sp)
; ZVFHMIN32-NEXT: sb a0, 208(sp)
; ZVFHMIN32-NEXT: lh a0, 738(sp)
; ZVFHMIN32-NEXT: lh a1, 482(sp)
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 29
-; ZVFHMIN32-NEXT: mul a2, a2, a3
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 28
-; ZVFHMIN32-NEXT: mul a2, a2, a3
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s a2, v10
+; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: vmv.x.s a2, v12
+; ZVFHMIN32-NEXT: sw a2, 120(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1582,15 +1567,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 736(sp)
; ZVFHMIN32-NEXT: lh a1, 480(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 27
+; ZVFHMIN32-NEXT: li a3, 29
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 26
+; ZVFHMIN32-NEXT: li a3, 28
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1598,15 +1583,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 734(sp)
; ZVFHMIN32-NEXT: lh a1, 478(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 25
+; ZVFHMIN32-NEXT: li a3, 27
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s9, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 24
+; ZVFHMIN32-NEXT: li a3, 26
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s6, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1614,138 +1599,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 732(sp)
; ZVFHMIN32-NEXT: lh a1, 476(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 23
+; ZVFHMIN32-NEXT: li a3, 25
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s t5, v3
+; ZVFHMIN32-NEXT: lh s7, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 24
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 174(sp)
; ZVFHMIN32-NEXT: lh a0, 730(sp)
; ZVFHMIN32-NEXT: lh a1, 474(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s2, v31
-; ZVFHMIN32-NEXT: vmv.x.s t6, v5
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: li a3, 23
+; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: lh s8, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t4, v21
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 173(sp)
-; ZVFHMIN32-NEXT: lh a1, 728(sp)
-; ZVFHMIN32-NEXT: lh s10, 472(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a3, v9
-; ZVFHMIN32-NEXT: vmv.x.s a4, v11
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 172(sp)
-; ZVFHMIN32-NEXT: lh a1, 726(sp)
-; ZVFHMIN32-NEXT: lh s10, 470(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v13
-; ZVFHMIN32-NEXT: vmv.x.s s11, v29
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 171(sp)
-; ZVFHMIN32-NEXT: lh ra, 724(sp)
-; ZVFHMIN32-NEXT: lh a0, 468(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a5, v27
-; ZVFHMIN32-NEXT: vmv.x.s s10, v7
-; ZVFHMIN32-NEXT: fmv.h.x fa5, ra
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: lh a0, 728(sp)
+; ZVFHMIN32-NEXT: lh a1, 472(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t6, v3
+; ZVFHMIN32-NEXT: vmv.x.s t5, v19
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 172(sp)
+; ZVFHMIN32-NEXT: lh a0, 726(sp)
+; ZVFHMIN32-NEXT: lh a1, 470(sp)
+; ZVFHMIN32-NEXT: vmv.x.s s10, v11
+; ZVFHMIN32-NEXT: vmv.x.s s11, v7
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 171(sp)
+; ZVFHMIN32-NEXT: lh a0, 724(sp)
+; ZVFHMIN32-NEXT: lh s9, 468(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a4, v9
+; ZVFHMIN32-NEXT: vmv.x.s ra, v29
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s9
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 170(sp)
; ZVFHMIN32-NEXT: lh a0, 722(sp)
; ZVFHMIN32-NEXT: lh a1, 466(sp)
-; ZVFHMIN32-NEXT: vmv.x.s ra, v21
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s7
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT: vmv.x.s s9, v31
+; ZVFHMIN32-NEXT: vmv.x.s a3, v5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 169(sp)
; ZVFHMIN32-NEXT: lh a0, 720(sp)
; ZVFHMIN32-NEXT: lh a1, 464(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s4
-; ZVFHMIN32-NEXT: fmv.h.x fa3, s8
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: vmv.x.s a2, v27
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN32-NEXT: sb a0, 168(sp)
; ZVFHMIN32-NEXT: lh a0, 718(sp)
; ZVFHMIN32-NEXT: lh a1, 462(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa2, s5
-; ZVFHMIN32-NEXT: fmv.h.x fa1, s9
-; ZVFHMIN32-NEXT: fmv.h.x fa0, a0
-; ZVFHMIN32-NEXT: fmv.h.x ft0, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, s6
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN32-NEXT: sb a0, 167(sp)
; ZVFHMIN32-NEXT: lh a0, 716(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa0, s6
; ZVFHMIN32-NEXT: lh a1, 460(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft0, a3
+; ZVFHMIN32-NEXT: fmv.h.x fa2, s3
+; ZVFHMIN32-NEXT: fmv.h.x fa1, s7
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN32-NEXT: fmv.h.x ft0, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa0, ft0
+; ZVFHMIN32-NEXT: sb a0, 166(sp)
+; ZVFHMIN32-NEXT: lh a0, 714(sp)
+; ZVFHMIN32-NEXT: lh a1, 458(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa0, s4
+; ZVFHMIN32-NEXT: fmv.h.x ft0, s8
; ZVFHMIN32-NEXT: fmv.h.x ft1, a0
-; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0
+; ZVFHMIN32-NEXT: fmv.h.x ft2, a1
+; ZVFHMIN32-NEXT: feq.h a0, ft1, ft2
+; ZVFHMIN32-NEXT: sb a0, 165(sp)
+; ZVFHMIN32-NEXT: lh a0, 712(sp)
+; ZVFHMIN32-NEXT: lh a1, 456(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft1, s10
+; ZVFHMIN32-NEXT: fmv.h.x ft2, s11
+; ZVFHMIN32-NEXT: fmv.h.x ft3, a0
+; ZVFHMIN32-NEXT: fmv.h.x ft4, a1
+; ZVFHMIN32-NEXT: feq.h a0, ft3, ft4
+; ZVFHMIN32-NEXT: sb a0, 164(sp)
+; ZVFHMIN32-NEXT: lh a0, 710(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft3, a4
+; ZVFHMIN32-NEXT: lh a1, 454(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft4, ra
+; ZVFHMIN32-NEXT: fmv.h.x ft5, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h a1, ft1, fa5
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT: sb a1, 166(sp)
-; ZVFHMIN32-NEXT: lh a1, 714(sp)
-; ZVFHMIN32-NEXT: fmv.h.x ft0, a2
-; ZVFHMIN32-NEXT: lh a2, 458(sp)
-; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: feq.h a1, ft5, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: sb a1, 163(sp)
+; ZVFHMIN32-NEXT: lh a1, 708(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft1, a2
+; ZVFHMIN32-NEXT: lh a2, 452(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa0, fa5
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa3, ft0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT: sb a2, 165(sp)
-; ZVFHMIN32-NEXT: lh a2, 712(sp)
-; ZVFHMIN32-NEXT: lh a4, 456(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s11
-; ZVFHMIN32-NEXT: feq.h s3, fa2, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa4, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: sb a2, 164(sp)
-; ZVFHMIN32-NEXT: lh a2, 710(sp)
-; ZVFHMIN32-NEXT: lh a4, 454(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa3, s10
-; ZVFHMIN32-NEXT: feq.h a5, fa1, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa4, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, ra
-; ZVFHMIN32-NEXT: sb a2, 163(sp)
-; ZVFHMIN32-NEXT: lh a2, 708(sp)
-; ZVFHMIN32-NEXT: lh a4, 452(sp)
-; ZVFHMIN32-NEXT: feq.h s4, fa0, fa3
-; ZVFHMIN32-NEXT: feq.h s5, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: feq.h a1, ft0, ft1
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a2
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa0
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
; ZVFHMIN32-NEXT: sb a2, 162(sp)
; ZVFHMIN32-NEXT: lh a2, 706(sp)
; ZVFHMIN32-NEXT: lh a4, 450(sp)
-; ZVFHMIN32-NEXT: sb s5, 129(sp)
-; ZVFHMIN32-NEXT: sb s4, 130(sp)
-; ZVFHMIN32-NEXT: sb a5, 131(sp)
-; ZVFHMIN32-NEXT: sb s3, 132(sp)
+; ZVFHMIN32-NEXT: sb a1, 129(sp)
+; ZVFHMIN32-NEXT: feq.h a1, fa1, fa5
+; ZVFHMIN32-NEXT: sb a3, 130(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa2, ft4
+; ZVFHMIN32-NEXT: sb a1, 131(sp)
+; ZVFHMIN32-NEXT: feq.h a1, fa4, ft2
+; ZVFHMIN32-NEXT: sb a3, 132(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa3, ft3
; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 133(sp)
-; ZVFHMIN32-NEXT: sb a3, 134(sp)
+; ZVFHMIN32-NEXT: sb a3, 133(sp)
+; ZVFHMIN32-NEXT: sb a1, 134(sp)
; ZVFHMIN32-NEXT: sb a0, 135(sp)
; ZVFHMIN32-NEXT: sb a2, 161(sp)
; ZVFHMIN32-NEXT: lh a0, 610(sp)
; ZVFHMIN32-NEXT: lh a1, 354(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s6, v23
+; ZVFHMIN32-NEXT: vmv.x.s s4, v23
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 18
+; ZVFHMIN32-NEXT: li a3, 10
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
@@ -1753,13 +1748,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: lh a0, 608(sp)
; ZVFHMIN32-NEXT: lh a1, 352(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 22
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a2, a2, 4
; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT: lh s5, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 21
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a3, a2, 4
+; ZVFHMIN32-NEXT: sub a2, a3, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
@@ -1768,148 +1762,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 240(sp)
; ZVFHMIN32-NEXT: lh a0, 606(sp)
; ZVFHMIN32-NEXT: lh a1, 350(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa3, t5
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2
+; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT: vmv.x.s s6, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 239(sp)
; ZVFHMIN32-NEXT: lh a0, 604(sp)
; ZVFHMIN32-NEXT: lh a1, 348(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT: vmv.x.s s7, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 238(sp)
; ZVFHMIN32-NEXT: lh a0, 602(sp)
; ZVFHMIN32-NEXT: lh a1, 346(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT: vmv.x.s s8, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 237(sp)
; ZVFHMIN32-NEXT: lh a0, 600(sp)
; ZVFHMIN32-NEXT: lh a1, 344(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a3, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT: vmv.x.s s9, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 236(sp)
; ZVFHMIN32-NEXT: lh a0, 598(sp)
; ZVFHMIN32-NEXT: lh a1, 342(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a4, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT: vmv.x.s s10, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 235(sp)
; ZVFHMIN32-NEXT: lh a0, 596(sp)
; ZVFHMIN32-NEXT: lh a1, 340(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a5, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT: vmv.x.s s11, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 234(sp)
; ZVFHMIN32-NEXT: lh a0, 594(sp)
; ZVFHMIN32-NEXT: lh a1, 338(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t6, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT: vmv.x.s ra, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 233(sp)
; ZVFHMIN32-NEXT: lh a0, 592(sp)
; ZVFHMIN32-NEXT: lh a1, 336(sp)
-; ZVFHMIN32-NEXT: vmv.x.s s2, v8
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
; ZVFHMIN32-NEXT: sb a0, 232(sp)
; ZVFHMIN32-NEXT: lh a0, 590(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a2
; ZVFHMIN32-NEXT: lh a1, 334(sp)
-; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT: fmv.h.x fa2, s4
; ZVFHMIN32-NEXT: fmv.h.x fa1, a0
-; ZVFHMIN32-NEXT: feq.h t5, fa3, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa1, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa3, a3
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0
; ZVFHMIN32-NEXT: sb a0, 231(sp)
; ZVFHMIN32-NEXT: lh a0, 588(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa2, a4
; ZVFHMIN32-NEXT: lh a1, 332(sp)
-; ZVFHMIN32-NEXT: feq.h a3, fa5, fa3
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: feq.h a0, fa4, fa2
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s6
-; ZVFHMIN32-NEXT: sb a1, 230(sp)
-; ZVFHMIN32-NEXT: lh a1, 586(sp)
-; ZVFHMIN32-NEXT: lh a4, 330(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa1, s2
+; ZVFHMIN32-NEXT: fmv.h.x fa0, s5
+; ZVFHMIN32-NEXT: fmv.h.x ft0, a0
+; ZVFHMIN32-NEXT: fmv.h.x ft1, a1
+; ZVFHMIN32-NEXT: feq.h a0, ft0, ft1
+; ZVFHMIN32-NEXT: sb a0, 230(sp)
+; ZVFHMIN32-NEXT: lh a0, 586(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft0, s3
+; ZVFHMIN32-NEXT: lh a1, 330(sp)
+; ZVFHMIN32-NEXT: fmv.h.x ft1, s6
+; ZVFHMIN32-NEXT: fmv.h.x ft2, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa5, ft1
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT: feq.h a1, ft2, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s7
; ZVFHMIN32-NEXT: sb a1, 229(sp)
; ZVFHMIN32-NEXT: lh a1, 584(sp)
-; ZVFHMIN32-NEXT: lh a4, 328(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x ft1, s8
+; ZVFHMIN32-NEXT: lh a2, 328(sp)
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s4
-; ZVFHMIN32-NEXT: sb a1, 228(sp)
-; ZVFHMIN32-NEXT: lh a1, 582(sp)
-; ZVFHMIN32-NEXT: lh a4, 326(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT: sb a1, 227(sp)
-; ZVFHMIN32-NEXT: lh a1, 580(sp)
-; ZVFHMIN32-NEXT: lh a4, 324(sp)
+; ZVFHMIN32-NEXT: feq.h a1, fa3, ft1
; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 226(sp)
-; ZVFHMIN32-NEXT: lh a1, 578(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT: sb a2, 228(sp)
+; ZVFHMIN32-NEXT: lh a2, 582(sp)
+; ZVFHMIN32-NEXT: lh a4, 326(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT: feq.h t4, fa2, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s11
+; ZVFHMIN32-NEXT: fmv.h.x fa3, ra
+; ZVFHMIN32-NEXT: sb a2, 227(sp)
+; ZVFHMIN32-NEXT: lh a2, 580(sp)
+; ZVFHMIN32-NEXT: lh a4, 324(sp)
+; ZVFHMIN32-NEXT: feq.h t5, fa0, fa5
+; ZVFHMIN32-NEXT: feq.h t6, ft0, fa3
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT: sb a2, 226(sp)
+; ZVFHMIN32-NEXT: lh a2, 578(sp)
; ZVFHMIN32-NEXT: lh a4, 322(sp)
-; ZVFHMIN32-NEXT: sb a2, 193(sp)
-; ZVFHMIN32-NEXT: sb s2, 194(sp)
+; ZVFHMIN32-NEXT: sb t6, 193(sp)
+; ZVFHMIN32-NEXT: feq.h t6, fa1, fa4
+; ZVFHMIN32-NEXT: sb t5, 194(sp)
; ZVFHMIN32-NEXT: sb t6, 195(sp)
-; ZVFHMIN32-NEXT: sb a5, 196(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: sb t4, 196(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 197(sp)
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 197(sp)
; ZVFHMIN32-NEXT: sb a3, 198(sp)
-; ZVFHMIN32-NEXT: sb t5, 199(sp)
-; ZVFHMIN32-NEXT: sb a1, 225(sp)
+; ZVFHMIN32-NEXT: sb a0, 199(sp)
+; ZVFHMIN32-NEXT: sb a2, 225(sp)
; ZVFHMIN32-NEXT: lh a0, 766(sp)
; ZVFHMIN32-NEXT: lh a1, 510(sp)
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 19
-; ZVFHMIN32-NEXT: mul a2, a2, a3
+; ZVFHMIN32-NEXT: slli a3, a2, 4
+; ZVFHMIN32-NEXT: add a2, a3, a2
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s s2, v8
; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: li a3, 14
+; ZVFHMIN32-NEXT: li a3, 11
; ZVFHMIN32-NEXT: mul a2, a2, a3
; ZVFHMIN32-NEXT: add a2, sp, a2
; ZVFHMIN32-NEXT: addi a2, a2, 848
@@ -1921,301 +1915,305 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: sb a0, 191(sp)
; ZVFHMIN32-NEXT: lh a0, 764(sp)
; ZVFHMIN32-NEXT: lh a1, 508(sp)
-; ZVFHMIN32-NEXT: vmv.x.s t5, v6
-; ZVFHMIN32-NEXT: csrr a2, vlenb
-; ZVFHMIN32-NEXT: slli a2, a2, 2
-; ZVFHMIN32-NEXT: add a2, sp, a2
-; ZVFHMIN32-NEXT: addi a2, a2, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: vmv.x.s t5, v4
+; ZVFHMIN32-NEXT: vmv.x.s t4, v30
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 190(sp)
; ZVFHMIN32-NEXT: lh a0, 762(sp)
; ZVFHMIN32-NEXT: lh a1, 506(sp)
+; ZVFHMIN32-NEXT: csrr a2, vlenb
+; ZVFHMIN32-NEXT: slli a2, a2, 2
+; ZVFHMIN32-NEXT: add a2, sp, a2
+; ZVFHMIN32-NEXT: addi a2, a2, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
; ZVFHMIN32-NEXT: csrr a3, vlenb
-; ZVFHMIN32-NEXT: slli a3, a3, 3
+; ZVFHMIN32-NEXT: slli a3, a3, 1
; ZVFHMIN32-NEXT: add a3, sp, a3
; ZVFHMIN32-NEXT: addi a3, a3, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 189(sp)
+; ZVFHMIN32-NEXT: lh a0, 760(sp)
+; ZVFHMIN32-NEXT: lh a1, 504(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t3
; ZVFHMIN32-NEXT: csrr a4, vlenb
-; ZVFHMIN32-NEXT: li a5, 6
-; ZVFHMIN32-NEXT: mul a4, a4, a5
+; ZVFHMIN32-NEXT: li t3, 6
+; ZVFHMIN32-NEXT: mul a4, a4, t3
; ZVFHMIN32-NEXT: add a4, sp, a4
; ZVFHMIN32-NEXT: addi a4, a4, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a4, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 189(sp)
-; ZVFHMIN32-NEXT: lh a1, 760(sp)
-; ZVFHMIN32-NEXT: lh a5, 504(sp)
-; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li s3, 12
-; ZVFHMIN32-NEXT: mul a0, a0, s3
-; ZVFHMIN32-NEXT: add a0, sp, a0
-; ZVFHMIN32-NEXT: addi a0, a0, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s s5, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT: sb a0, 188(sp)
+; ZVFHMIN32-NEXT: lh a0, 758(sp)
+; ZVFHMIN32-NEXT: lh a1, 502(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: csrr t2, vlenb
+; ZVFHMIN32-NEXT: slli t2, t2, 3
+; ZVFHMIN32-NEXT: add t2, sp, t2
+; ZVFHMIN32-NEXT: addi t2, t2, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t2, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT: sb a0, 187(sp)
+; ZVFHMIN32-NEXT: lh a0, 756(sp)
+; ZVFHMIN32-NEXT: lh a1, 500(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa3, t1
+; ZVFHMIN32-NEXT: csrr t1, vlenb
+; ZVFHMIN32-NEXT: li t3, 13
+; ZVFHMIN32-NEXT: mul t1, t1, t3
+; ZVFHMIN32-NEXT: add t1, sp, t1
+; ZVFHMIN32-NEXT: addi t1, t1, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s t3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT: sb a0, 186(sp)
+; ZVFHMIN32-NEXT: lh a0, 754(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa2, t0
+; ZVFHMIN32-NEXT: lh a1, 498(sp)
+; ZVFHMIN32-NEXT: csrr t0, vlenb
+; ZVFHMIN32-NEXT: li t1, 19
+; ZVFHMIN32-NEXT: mul t0, t0, t1
+; ZVFHMIN32-NEXT: add t0, sp, t0
+; ZVFHMIN32-NEXT: addi t0, t0, 848
+; ZVFHMIN32-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT: vmv.x.s s3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a0
; ZVFHMIN32-NEXT: csrr a0, vlenb
-; ZVFHMIN32-NEXT: li s3, 10
-; ZVFHMIN32-NEXT: mul a0, a0, s3
+; ZVFHMIN32-NEXT: li t0, 21
+; ZVFHMIN32-NEXT: mul a0, a0, t0
; ZVFHMIN32-NEXT: add a0, sp, a0
; ZVFHMIN32-NEXT: addi a0, a0, 848
; ZVFHMIN32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN32-NEXT: vmv.x.s a0, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 188(sp)
-; ZVFHMIN32-NEXT: lh a1, 758(sp)
-; ZVFHMIN32-NEXT: lh a5, 502(sp)
-; ZVFHMIN32-NEXT: csrr s3, vlenb
-; ZVFHMIN32-NEXT: slli s3, s3, 4
-; ZVFHMIN32-NEXT: add s3, sp, s3
-; ZVFHMIN32-NEXT: addi s3, s3, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s s4, v8
-; ZVFHMIN32-NEXT: vmv.x.s s3, v16
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT: sb a1, 187(sp)
-; ZVFHMIN32-NEXT: lh a1, 756(sp)
-; ZVFHMIN32-NEXT: lh a5, 500(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h t4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t3
-; ZVFHMIN32-NEXT: sb a1, 186(sp)
-; ZVFHMIN32-NEXT: lh a1, 754(sp)
-; ZVFHMIN32-NEXT: lh a2, 498(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t1
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa1, fa0
+; ZVFHMIN32-NEXT: fmv.h.x fa1, a2
; ZVFHMIN32-NEXT: sb a1, 185(sp)
; ZVFHMIN32-NEXT: lh a1, 752(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa0, a3
; ZVFHMIN32-NEXT: lh a2, 496(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4
+; ZVFHMIN32-NEXT: feq.h t0, fa5, fa1
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: feq.h t1, fa4, fa0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
; ZVFHMIN32-NEXT: sb a1, 184(sp)
; ZVFHMIN32-NEXT: lh a1, 750(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
; ZVFHMIN32-NEXT: lh a2, 494(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s5
-; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT: sb a1, 183(sp)
-; ZVFHMIN32-NEXT: lh a1, 748(sp)
-; ZVFHMIN32-NEXT: lh a2, 492(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: feq.h a3, fa3, fa5
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa2, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT: sb a1, 182(sp)
-; ZVFHMIN32-NEXT: lh a1, 746(sp)
-; ZVFHMIN32-NEXT: lh a2, 490(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, s4
-; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a2, 183(sp)
+; ZVFHMIN32-NEXT: lh a2, 748(sp)
+; ZVFHMIN32-NEXT: lh a4, 492(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT: sb a1, 181(sp)
-; ZVFHMIN32-NEXT: lh a1, 744(sp)
-; ZVFHMIN32-NEXT: lh a2, 488(sp)
+; ZVFHMIN32-NEXT: sb a2, 182(sp)
+; ZVFHMIN32-NEXT: lh a2, 746(sp)
+; ZVFHMIN32-NEXT: lh a4, 490(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, s3
; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: lw a2, 104(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: addi a2, sp, 848
-; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a5
+; ZVFHMIN32-NEXT: sb a2, 181(sp)
+; ZVFHMIN32-NEXT: lh a2, 744(sp)
+; ZVFHMIN32-NEXT: lh a4, 488(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: lw a4, 108(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: vmv.x.s a5, v0
; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT: vmv.x.s a5, v8
-; ZVFHMIN32-NEXT: sb a1, 180(sp)
-; ZVFHMIN32-NEXT: lh a1, 742(sp)
-; ZVFHMIN32-NEXT: lh a7, 486(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: vmv.x.s a4, v8
+; ZVFHMIN32-NEXT: sb a2, 180(sp)
+; ZVFHMIN32-NEXT: lh a2, 742(sp)
+; ZVFHMIN32-NEXT: lh t2, 486(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 179(sp)
-; ZVFHMIN32-NEXT: lh a1, 740(sp)
-; ZVFHMIN32-NEXT: lh a7, 484(sp)
-; ZVFHMIN32-NEXT: sb a3, 140(sp)
-; ZVFHMIN32-NEXT: sb t1, 141(sp)
-; ZVFHMIN32-NEXT: sb t3, 142(sp)
-; ZVFHMIN32-NEXT: sb t4, 143(sp)
-; ZVFHMIN32-NEXT: sb a2, 136(sp)
-; ZVFHMIN32-NEXT: sb a6, 137(sp)
-; ZVFHMIN32-NEXT: sb a4, 138(sp)
-; ZVFHMIN32-NEXT: sb a0, 139(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: sb a2, 179(sp)
+; ZVFHMIN32-NEXT: lh a2, 740(sp)
+; ZVFHMIN32-NEXT: lh t2, 484(sp)
+; ZVFHMIN32-NEXT: sb a1, 140(sp)
+; ZVFHMIN32-NEXT: sb a3, 141(sp)
+; ZVFHMIN32-NEXT: sb t1, 142(sp)
+; ZVFHMIN32-NEXT: sb t0, 143(sp)
+; ZVFHMIN32-NEXT: sb a5, 136(sp)
+; ZVFHMIN32-NEXT: sb a0, 137(sp)
+; ZVFHMIN32-NEXT: sb a6, 138(sp)
+; ZVFHMIN32-NEXT: sb a7, 139(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 178(sp)
-; ZVFHMIN32-NEXT: lh a1, 638(sp)
-; ZVFHMIN32-NEXT: lh a2, 382(sp)
+; ZVFHMIN32-NEXT: lh a0, 638(sp)
+; ZVFHMIN32-NEXT: lh a1, 382(sp)
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14
-; ZVFHMIN32-NEXT: vmv.x.s a0, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 255(sp)
-; ZVFHMIN32-NEXT: lh a1, 636(sp)
-; ZVFHMIN32-NEXT: lh a2, 380(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13
; ZVFHMIN32-NEXT: vmv.x.s t2, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 254(sp)
-; ZVFHMIN32-NEXT: lh a1, 634(sp)
-; ZVFHMIN32-NEXT: lh a2, 378(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 255(sp)
+; ZVFHMIN32-NEXT: lh a0, 636(sp)
+; ZVFHMIN32-NEXT: lh a1, 380(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13
; ZVFHMIN32-NEXT: vmv.x.s t1, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 253(sp)
-; ZVFHMIN32-NEXT: lh a1, 632(sp)
-; ZVFHMIN32-NEXT: lh a2, 376(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 254(sp)
+; ZVFHMIN32-NEXT: lh a0, 634(sp)
+; ZVFHMIN32-NEXT: lh a1, 378(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12
; ZVFHMIN32-NEXT: vmv.x.s t0, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 252(sp)
-; ZVFHMIN32-NEXT: lh a1, 630(sp)
-; ZVFHMIN32-NEXT: lh a2, 374(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 253(sp)
+; ZVFHMIN32-NEXT: lh a0, 632(sp)
+; ZVFHMIN32-NEXT: lh a1, 376(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11
; ZVFHMIN32-NEXT: vmv.x.s a7, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: sb a1, 251(sp)
-; ZVFHMIN32-NEXT: lh a1, 628(sp)
-; ZVFHMIN32-NEXT: lh a2, 372(sp)
-; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 252(sp)
+; ZVFHMIN32-NEXT: lh a0, 630(sp)
+; ZVFHMIN32-NEXT: lh a1, 374(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10
; ZVFHMIN32-NEXT: vmv.x.s a6, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: lw a2, 108(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: sb a1, 250(sp)
-; ZVFHMIN32-NEXT: lh a1, 626(sp)
-; ZVFHMIN32-NEXT: lh a2, 370(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT: lw a2, 112(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT: sb a1, 249(sp)
-; ZVFHMIN32-NEXT: lh a1, 624(sp)
-; ZVFHMIN32-NEXT: lh a2, 368(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 248(sp)
-; ZVFHMIN32-NEXT: lh a0, 622(sp)
-; ZVFHMIN32-NEXT: lh a1, 366(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 251(sp)
+; ZVFHMIN32-NEXT: lh a0, 628(sp)
+; ZVFHMIN32-NEXT: lh a1, 372(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9
+; ZVFHMIN32-NEXT: vmv.x.s a5, v8
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 247(sp)
-; ZVFHMIN32-NEXT: lh a0, 620(sp)
-; ZVFHMIN32-NEXT: lh a1, 364(sp)
-; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: sb a0, 250(sp)
+; ZVFHMIN32-NEXT: lh a0, 626(sp)
+; ZVFHMIN32-NEXT: lh a1, 370(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT: sb a0, 246(sp)
-; ZVFHMIN32-NEXT: lh a0, 618(sp)
-; ZVFHMIN32-NEXT: lh a1, 362(sp)
+; ZVFHMIN32-NEXT: sb a0, 249(sp)
+; ZVFHMIN32-NEXT: lh a1, 624(sp)
+; ZVFHMIN32-NEXT: lh a3, 368(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: lw a3, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: sb a1, 248(sp)
+; ZVFHMIN32-NEXT: lh a1, 622(sp)
+; ZVFHMIN32-NEXT: lh a3, 366(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: lw a3, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: sb a1, 247(sp)
+; ZVFHMIN32-NEXT: lh a1, 620(sp)
+; ZVFHMIN32-NEXT: lh a3, 364(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, t0
; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT: sb a0, 245(sp)
-; ZVFHMIN32-NEXT: lh a0, 616(sp)
-; ZVFHMIN32-NEXT: lh a1, 360(sp)
+; ZVFHMIN32-NEXT: sb a1, 246(sp)
+; ZVFHMIN32-NEXT: lh a1, 618(sp)
+; ZVFHMIN32-NEXT: lh a3, 362(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT: sb a0, 244(sp)
-; ZVFHMIN32-NEXT: lh a0, 614(sp)
-; ZVFHMIN32-NEXT: lh a1, 358(sp)
+; ZVFHMIN32-NEXT: sb a1, 245(sp)
+; ZVFHMIN32-NEXT: lh a1, 616(sp)
+; ZVFHMIN32-NEXT: lh a3, 360(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa4, a6
; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN32-NEXT: fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT: sb a1, 244(sp)
+; ZVFHMIN32-NEXT: lh a1, 614(sp)
+; ZVFHMIN32-NEXT: lh a3, 358(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8
-; ZVFHMIN32-NEXT: vmv.x.s a1, v8
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT: sb a0, 243(sp)
-; ZVFHMIN32-NEXT: lh a0, 612(sp)
-; ZVFHMIN32-NEXT: lh a1, 356(sp)
-; ZVFHMIN32-NEXT: sb a5, 204(sp)
-; ZVFHMIN32-NEXT: sb a2, 205(sp)
-; ZVFHMIN32-NEXT: sb a3, 206(sp)
-; ZVFHMIN32-NEXT: sb a4, 207(sp)
-; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT: sb a2, 200(sp)
-; ZVFHMIN32-NEXT: sb a6, 201(sp)
-; ZVFHMIN32-NEXT: sb a7, 202(sp)
-; ZVFHMIN32-NEXT: sb t0, 203(sp)
-; ZVFHMIN32-NEXT: li a2, 128
-; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: vmv.x.s a3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: sb a1, 243(sp)
+; ZVFHMIN32-NEXT: lh a1, 612(sp)
+; ZVFHMIN32-NEXT: lh a3, 356(sp)
+; ZVFHMIN32-NEXT: sb t0, 204(sp)
+; ZVFHMIN32-NEXT: sb a4, 205(sp)
+; ZVFHMIN32-NEXT: sb a0, 206(sp)
+; ZVFHMIN32-NEXT: sb a2, 207(sp)
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 242(sp)
-; ZVFHMIN32-NEXT: addi a0, sp, 128
-; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN32-NEXT: vle8.v v8, (a0)
+; ZVFHMIN32-NEXT: sb a0, 200(sp)
+; ZVFHMIN32-NEXT: sb a5, 201(sp)
+; ZVFHMIN32-NEXT: sb a6, 202(sp)
+; ZVFHMIN32-NEXT: sb a7, 203(sp)
+; ZVFHMIN32-NEXT: li a0, 128
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 242(sp)
+; ZVFHMIN32-NEXT: addi a1, sp, 128
+; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN32-NEXT: vle8.v v8, (a1)
; ZVFHMIN32-NEXT: vand.vi v8, v8, 1
; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN32-NEXT: addi sp, s0, -896
@@ -2442,6 +2440,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 219(sp)
; ZVFHMIN64-NEXT: lh a0, 564(sp)
; ZVFHMIN64-NEXT: lh a1, 308(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 218(sp)
+; ZVFHMIN64-NEXT: lh a0, 562(sp)
+; ZVFHMIN64-NEXT: lh a1, 306(sp)
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7
; ZVFHMIN64-NEXT: csrr a2, vlenb
@@ -2494,86 +2498,82 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13
-; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 1
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: addi a2, a2, 800
+; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 13
+; ZVFHMIN64-NEXT: addi a2, sp, 800
; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11
-; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10
-; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9
-; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8
-; ZVFHMIN64-NEXT: vmv.x.s t5, v16
+; ZVFHMIN64-NEXT: vslidedown.vi v6, v8, 12
+; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 11
+; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 10
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 9
+; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 8
+; ZVFHMIN64-NEXT: vmv.x.s a3, v16
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 218(sp)
-; ZVFHMIN64-NEXT: lh a0, 562(sp)
-; ZVFHMIN64-NEXT: lh a1, 306(sp)
+; ZVFHMIN64-NEXT: sb a0, 217(sp)
+; ZVFHMIN64-NEXT: lh a0, 560(sp)
+; ZVFHMIN64-NEXT: lh a1, 304(sp)
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7
-; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6
-; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5
+; ZVFHMIN64-NEXT: vslidedown.vi v21, v16, 7
+; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 6
+; ZVFHMIN64-NEXT: vslidedown.vi v19, v16, 5
; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 18
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 10
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 22
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a2, a2, 4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 21
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a4, a2, 4
+; ZVFHMIN64-NEXT: sub a2, a4, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15
-; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14
-; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 13
-; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 12
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 11
-; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10
+; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 15
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 13
+; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12
+; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 11
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 19
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a4, a2, 4
+; ZVFHMIN64-NEXT: add a2, a4, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9
+; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 10
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 14
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 11
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8
+; ZVFHMIN64-NEXT: vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT: vslidedown.vi v4, v16, 9
+; ZVFHMIN64-NEXT: vslidedown.vi v30, v16, 8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 217(sp)
-; ZVFHMIN64-NEXT: lh a0, 560(sp)
-; ZVFHMIN64-NEXT: lh a1, 304(sp)
-; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 7
-; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 6
-; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 5
-; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4
-; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 3
-; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 2
-; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 1
+; ZVFHMIN64-NEXT: sb a0, 216(sp)
+; ZVFHMIN64-NEXT: lh a0, 558(sp)
+; ZVFHMIN64-NEXT: lh a1, 302(sp)
+; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 7
+; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 6
+; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 5
+; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 4
+; ZVFHMIN64-NEXT: vslidedown.vi v31, v0, 3
+; ZVFHMIN64-NEXT: vslidedown.vi v5, v0, 2
+; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15
; ZVFHMIN64-NEXT: csrr a2, vlenb
@@ -2583,99 +2583,88 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 3
+; ZVFHMIN64-NEXT: slli a2, a2, 1
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 6
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 6
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 12
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a2, a2, 3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 10
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: li a4, 13
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 4
+; ZVFHMIN64-NEXT: li a4, 19
+; ZVFHMIN64-NEXT: mul a2, a2, a4
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a4, 21
+; ZVFHMIN64-NEXT: mul a2, a2, a4
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: addi a2, a2, 800
+; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8
-; ZVFHMIN64-NEXT: addi a2, sp, 800
-; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT: vmv.x.s t4, v26
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 216(sp)
-; ZVFHMIN64-NEXT: lh a0, 558(sp)
-; ZVFHMIN64-NEXT: lh a1, 302(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t3, v20
-; ZVFHMIN64-NEXT: vmv.x.s t1, v28
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 215(sp)
; ZVFHMIN64-NEXT: lh a0, 556(sp)
; ZVFHMIN64-NEXT: lh a1, 300(sp)
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 1
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t2, v0
-; ZVFHMIN64-NEXT: vmv.x.s t0, v4
+; ZVFHMIN64-NEXT: vmv.x.s t3, v26
+; ZVFHMIN64-NEXT: vmv.x.s t2, v28
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 214(sp)
; ZVFHMIN64-NEXT: lh a0, 554(sp)
; ZVFHMIN64-NEXT: lh a1, 298(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a7, v2
-; ZVFHMIN64-NEXT: vmv.x.s a6, v30
+; ZVFHMIN64-NEXT: addi a2, sp, 800
+; ZVFHMIN64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t1, v16
+; ZVFHMIN64-NEXT: vmv.x.s t0, v6
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 213(sp)
; ZVFHMIN64-NEXT: lh a0, 552(sp)
; ZVFHMIN64-NEXT: lh a1, 296(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v22
-; ZVFHMIN64-NEXT: sd a2, 80(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: vmv.x.s a2, v18
-; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a7, v2
+; ZVFHMIN64-NEXT: vmv.x.s a6, v22
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 212(sp)
; ZVFHMIN64-NEXT: lh a0, 550(sp)
; ZVFHMIN64-NEXT: lh a1, 294(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v14
-; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT: vmv.x.s a2, v12
-; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a5, v20
+; ZVFHMIN64-NEXT: vmv.x.s a2, v18
+; ZVFHMIN64-NEXT: sd a2, 88(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 211(sp)
; ZVFHMIN64-NEXT: lh a0, 548(sp)
; ZVFHMIN64-NEXT: lh a1, 292(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v10
-; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v14
+; ZVFHMIN64-NEXT: sd a2, 104(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: sd a2, 120(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
@@ -2684,33 +2673,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 210(sp)
; ZVFHMIN64-NEXT: lh a0, 546(sp)
; ZVFHMIN64-NEXT: lh a1, 290(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT: vmv.x.s t5, v24
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: vmv.x.s a3, v24
; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN64-NEXT: sb a0, 209(sp)
; ZVFHMIN64-NEXT: lh a0, 544(sp)
; ZVFHMIN64-NEXT: lh a1, 288(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb t5, 192(sp)
+; ZVFHMIN64-NEXT: sb a3, 192(sp)
; ZVFHMIN64-NEXT: sb a0, 208(sp)
; ZVFHMIN64-NEXT: lh a0, 738(sp)
; ZVFHMIN64-NEXT: lh a1, 482(sp)
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 29
-; ZVFHMIN64-NEXT: mul a2, a2, a3
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 28
-; ZVFHMIN64-NEXT: mul a2, a2, a3
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s a2, v10
+; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: vmv.x.s a2, v12
+; ZVFHMIN64-NEXT: sd a2, 112(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2718,15 +2701,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 736(sp)
; ZVFHMIN64-NEXT: lh a1, 480(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 27
+; ZVFHMIN64-NEXT: li a3, 29
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 26
+; ZVFHMIN64-NEXT: li a3, 28
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2734,15 +2717,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 734(sp)
; ZVFHMIN64-NEXT: lh a1, 478(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 25
+; ZVFHMIN64-NEXT: li a3, 27
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s9, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 24
+; ZVFHMIN64-NEXT: li a3, 26
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s6, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2750,138 +2733,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 732(sp)
; ZVFHMIN64-NEXT: lh a1, 476(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 23
+; ZVFHMIN64-NEXT: li a3, 25
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s t5, v3
+; ZVFHMIN64-NEXT: lh s7, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 24
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 174(sp)
; ZVFHMIN64-NEXT: lh a0, 730(sp)
; ZVFHMIN64-NEXT: lh a1, 474(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s2, v31
-; ZVFHMIN64-NEXT: vmv.x.s t6, v5
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: li a3, 23
+; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: lh s8, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t4, v21
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 173(sp)
-; ZVFHMIN64-NEXT: lh a1, 728(sp)
-; ZVFHMIN64-NEXT: lh s10, 472(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a3, v9
-; ZVFHMIN64-NEXT: vmv.x.s a4, v11
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 172(sp)
-; ZVFHMIN64-NEXT: lh a1, 726(sp)
-; ZVFHMIN64-NEXT: lh s10, 470(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v13
-; ZVFHMIN64-NEXT: vmv.x.s s11, v29
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 171(sp)
-; ZVFHMIN64-NEXT: lh ra, 724(sp)
-; ZVFHMIN64-NEXT: lh a0, 468(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a5, v27
-; ZVFHMIN64-NEXT: vmv.x.s s10, v7
-; ZVFHMIN64-NEXT: fmv.h.x fa5, ra
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: lh a0, 728(sp)
+; ZVFHMIN64-NEXT: lh a1, 472(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t6, v3
+; ZVFHMIN64-NEXT: vmv.x.s t5, v19
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 172(sp)
+; ZVFHMIN64-NEXT: lh a0, 726(sp)
+; ZVFHMIN64-NEXT: lh a1, 470(sp)
+; ZVFHMIN64-NEXT: vmv.x.s s10, v11
+; ZVFHMIN64-NEXT: vmv.x.s s11, v7
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 171(sp)
+; ZVFHMIN64-NEXT: lh a0, 724(sp)
+; ZVFHMIN64-NEXT: lh s9, 468(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a4, v9
+; ZVFHMIN64-NEXT: vmv.x.s ra, v29
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s9
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 170(sp)
; ZVFHMIN64-NEXT: lh a0, 722(sp)
; ZVFHMIN64-NEXT: lh a1, 466(sp)
-; ZVFHMIN64-NEXT: vmv.x.s ra, v21
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s7
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT: vmv.x.s s9, v31
+; ZVFHMIN64-NEXT: vmv.x.s a3, v5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 169(sp)
; ZVFHMIN64-NEXT: lh a0, 720(sp)
; ZVFHMIN64-NEXT: lh a1, 464(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s4
-; ZVFHMIN64-NEXT: fmv.h.x fa3, s8
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: vmv.x.s a2, v27
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
; ZVFHMIN64-NEXT: sb a0, 168(sp)
; ZVFHMIN64-NEXT: lh a0, 718(sp)
; ZVFHMIN64-NEXT: lh a1, 462(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa2, s5
-; ZVFHMIN64-NEXT: fmv.h.x fa1, s9
-; ZVFHMIN64-NEXT: fmv.h.x fa0, a0
-; ZVFHMIN64-NEXT: fmv.h.x ft0, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa0, ft0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, s6
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
; ZVFHMIN64-NEXT: sb a0, 167(sp)
; ZVFHMIN64-NEXT: lh a0, 716(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa0, s6
; ZVFHMIN64-NEXT: lh a1, 460(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft0, a3
+; ZVFHMIN64-NEXT: fmv.h.x fa2, s3
+; ZVFHMIN64-NEXT: fmv.h.x fa1, s7
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN64-NEXT: fmv.h.x ft0, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa0, ft0
+; ZVFHMIN64-NEXT: sb a0, 166(sp)
+; ZVFHMIN64-NEXT: lh a0, 714(sp)
+; ZVFHMIN64-NEXT: lh a1, 458(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa0, s4
+; ZVFHMIN64-NEXT: fmv.h.x ft0, s8
; ZVFHMIN64-NEXT: fmv.h.x ft1, a0
-; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0
+; ZVFHMIN64-NEXT: fmv.h.x ft2, a1
+; ZVFHMIN64-NEXT: feq.h a0, ft1, ft2
+; ZVFHMIN64-NEXT: sb a0, 165(sp)
+; ZVFHMIN64-NEXT: lh a0, 712(sp)
+; ZVFHMIN64-NEXT: lh a1, 456(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft1, s10
+; ZVFHMIN64-NEXT: fmv.h.x ft2, s11
+; ZVFHMIN64-NEXT: fmv.h.x ft3, a0
+; ZVFHMIN64-NEXT: fmv.h.x ft4, a1
+; ZVFHMIN64-NEXT: feq.h a0, ft3, ft4
+; ZVFHMIN64-NEXT: sb a0, 164(sp)
+; ZVFHMIN64-NEXT: lh a0, 710(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft3, a4
+; ZVFHMIN64-NEXT: lh a1, 454(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft4, ra
+; ZVFHMIN64-NEXT: fmv.h.x ft5, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h a1, ft1, fa5
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT: sb a1, 166(sp)
-; ZVFHMIN64-NEXT: lh a1, 714(sp)
-; ZVFHMIN64-NEXT: fmv.h.x ft0, a2
-; ZVFHMIN64-NEXT: lh a2, 458(sp)
-; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: feq.h a1, ft5, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: sb a1, 163(sp)
+; ZVFHMIN64-NEXT: lh a1, 708(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft1, a2
+; ZVFHMIN64-NEXT: lh a2, 452(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa0, fa5
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa3, ft0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT: sb a2, 165(sp)
-; ZVFHMIN64-NEXT: lh a2, 712(sp)
-; ZVFHMIN64-NEXT: lh a4, 456(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s11
-; ZVFHMIN64-NEXT: feq.h s3, fa2, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa4, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: sb a2, 164(sp)
-; ZVFHMIN64-NEXT: lh a2, 710(sp)
-; ZVFHMIN64-NEXT: lh a4, 454(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa3, s10
-; ZVFHMIN64-NEXT: feq.h a5, fa1, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa4, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, ra
-; ZVFHMIN64-NEXT: sb a2, 163(sp)
-; ZVFHMIN64-NEXT: lh a2, 708(sp)
-; ZVFHMIN64-NEXT: lh a4, 452(sp)
-; ZVFHMIN64-NEXT: feq.h s4, fa0, fa3
-; ZVFHMIN64-NEXT: feq.h s5, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: feq.h a1, ft0, ft1
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a2
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa0
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
; ZVFHMIN64-NEXT: sb a2, 162(sp)
; ZVFHMIN64-NEXT: lh a2, 706(sp)
; ZVFHMIN64-NEXT: lh a4, 450(sp)
-; ZVFHMIN64-NEXT: sb s5, 129(sp)
-; ZVFHMIN64-NEXT: sb s4, 130(sp)
-; ZVFHMIN64-NEXT: sb a5, 131(sp)
-; ZVFHMIN64-NEXT: sb s3, 132(sp)
+; ZVFHMIN64-NEXT: sb a1, 129(sp)
+; ZVFHMIN64-NEXT: feq.h a1, fa1, fa5
+; ZVFHMIN64-NEXT: sb a3, 130(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa2, ft4
+; ZVFHMIN64-NEXT: sb a1, 131(sp)
+; ZVFHMIN64-NEXT: feq.h a1, fa4, ft2
+; ZVFHMIN64-NEXT: sb a3, 132(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa3, ft3
; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 133(sp)
-; ZVFHMIN64-NEXT: sb a3, 134(sp)
+; ZVFHMIN64-NEXT: sb a3, 133(sp)
+; ZVFHMIN64-NEXT: sb a1, 134(sp)
; ZVFHMIN64-NEXT: sb a0, 135(sp)
; ZVFHMIN64-NEXT: sb a2, 161(sp)
; ZVFHMIN64-NEXT: lh a0, 610(sp)
; ZVFHMIN64-NEXT: lh a1, 354(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s6, v23
+; ZVFHMIN64-NEXT: vmv.x.s s4, v23
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 18
+; ZVFHMIN64-NEXT: li a3, 10
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
@@ -2889,13 +2882,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: lh a0, 608(sp)
; ZVFHMIN64-NEXT: lh a1, 352(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 22
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a2, a2, 4
; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: lh s5, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 21
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a3, a2, 4
+; ZVFHMIN64-NEXT: sub a2, a3, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
@@ -2904,148 +2896,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 240(sp)
; ZVFHMIN64-NEXT: lh a0, 606(sp)
; ZVFHMIN64-NEXT: lh a1, 350(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa3, t5
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2
+; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT: vmv.x.s s6, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 239(sp)
; ZVFHMIN64-NEXT: lh a0, 604(sp)
; ZVFHMIN64-NEXT: lh a1, 348(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT: vmv.x.s s7, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 238(sp)
; ZVFHMIN64-NEXT: lh a0, 602(sp)
; ZVFHMIN64-NEXT: lh a1, 346(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT: vmv.x.s s8, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 237(sp)
; ZVFHMIN64-NEXT: lh a0, 600(sp)
; ZVFHMIN64-NEXT: lh a1, 344(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a3, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT: vmv.x.s s9, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 236(sp)
; ZVFHMIN64-NEXT: lh a0, 598(sp)
; ZVFHMIN64-NEXT: lh a1, 342(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a4, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT: vmv.x.s s10, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 235(sp)
; ZVFHMIN64-NEXT: lh a0, 596(sp)
; ZVFHMIN64-NEXT: lh a1, 340(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a5, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT: vmv.x.s s11, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 234(sp)
; ZVFHMIN64-NEXT: lh a0, 594(sp)
; ZVFHMIN64-NEXT: lh a1, 338(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t6, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT: vmv.x.s ra, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 233(sp)
; ZVFHMIN64-NEXT: lh a0, 592(sp)
; ZVFHMIN64-NEXT: lh a1, 336(sp)
-; ZVFHMIN64-NEXT: vmv.x.s s2, v8
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
; ZVFHMIN64-NEXT: sb a0, 232(sp)
; ZVFHMIN64-NEXT: lh a0, 590(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a2
; ZVFHMIN64-NEXT: lh a1, 334(sp)
-; ZVFHMIN64-NEXT: vmv.x.s a2, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT: fmv.h.x fa2, s4
; ZVFHMIN64-NEXT: fmv.h.x fa1, a0
-; ZVFHMIN64-NEXT: feq.h t5, fa3, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa1, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa3, a3
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0
; ZVFHMIN64-NEXT: sb a0, 231(sp)
; ZVFHMIN64-NEXT: lh a0, 588(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa2, a4
; ZVFHMIN64-NEXT: lh a1, 332(sp)
-; ZVFHMIN64-NEXT: feq.h a3, fa5, fa3
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: feq.h a0, fa4, fa2
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s6
-; ZVFHMIN64-NEXT: sb a1, 230(sp)
-; ZVFHMIN64-NEXT: lh a1, 586(sp)
-; ZVFHMIN64-NEXT: lh a4, 330(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa1, s2
+; ZVFHMIN64-NEXT: fmv.h.x fa0, s5
+; ZVFHMIN64-NEXT: fmv.h.x ft0, a0
+; ZVFHMIN64-NEXT: fmv.h.x ft1, a1
+; ZVFHMIN64-NEXT: feq.h a0, ft0, ft1
+; ZVFHMIN64-NEXT: sb a0, 230(sp)
+; ZVFHMIN64-NEXT: lh a0, 586(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft0, s3
+; ZVFHMIN64-NEXT: lh a1, 330(sp)
+; ZVFHMIN64-NEXT: fmv.h.x ft1, s6
+; ZVFHMIN64-NEXT: fmv.h.x ft2, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa5, ft1
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT: feq.h a1, ft2, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s7
; ZVFHMIN64-NEXT: sb a1, 229(sp)
; ZVFHMIN64-NEXT: lh a1, 584(sp)
-; ZVFHMIN64-NEXT: lh a4, 328(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s4
-; ZVFHMIN64-NEXT: sb a1, 228(sp)
-; ZVFHMIN64-NEXT: lh a1, 582(sp)
-; ZVFHMIN64-NEXT: lh a4, 326(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x ft1, s8
+; ZVFHMIN64-NEXT: lh a2, 328(sp)
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT: sb a1, 227(sp)
-; ZVFHMIN64-NEXT: lh a1, 580(sp)
-; ZVFHMIN64-NEXT: lh a4, 324(sp)
+; ZVFHMIN64-NEXT: feq.h a1, fa3, ft1
; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 226(sp)
-; ZVFHMIN64-NEXT: lh a1, 578(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT: sb a2, 228(sp)
+; ZVFHMIN64-NEXT: lh a2, 582(sp)
+; ZVFHMIN64-NEXT: lh a4, 326(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT: feq.h t4, fa2, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s11
+; ZVFHMIN64-NEXT: fmv.h.x fa3, ra
+; ZVFHMIN64-NEXT: sb a2, 227(sp)
+; ZVFHMIN64-NEXT: lh a2, 580(sp)
+; ZVFHMIN64-NEXT: lh a4, 324(sp)
+; ZVFHMIN64-NEXT: feq.h t5, fa0, fa5
+; ZVFHMIN64-NEXT: feq.h t6, ft0, fa3
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT: sb a2, 226(sp)
+; ZVFHMIN64-NEXT: lh a2, 578(sp)
; ZVFHMIN64-NEXT: lh a4, 322(sp)
-; ZVFHMIN64-NEXT: sb a2, 193(sp)
-; ZVFHMIN64-NEXT: sb s2, 194(sp)
+; ZVFHMIN64-NEXT: sb t6, 193(sp)
+; ZVFHMIN64-NEXT: feq.h t6, fa1, fa4
+; ZVFHMIN64-NEXT: sb t5, 194(sp)
; ZVFHMIN64-NEXT: sb t6, 195(sp)
-; ZVFHMIN64-NEXT: sb a5, 196(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: sb t4, 196(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 197(sp)
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 197(sp)
; ZVFHMIN64-NEXT: sb a3, 198(sp)
-; ZVFHMIN64-NEXT: sb t5, 199(sp)
-; ZVFHMIN64-NEXT: sb a1, 225(sp)
+; ZVFHMIN64-NEXT: sb a0, 199(sp)
+; ZVFHMIN64-NEXT: sb a2, 225(sp)
; ZVFHMIN64-NEXT: lh a0, 766(sp)
; ZVFHMIN64-NEXT: lh a1, 510(sp)
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 19
-; ZVFHMIN64-NEXT: mul a2, a2, a3
+; ZVFHMIN64-NEXT: slli a3, a2, 4
+; ZVFHMIN64-NEXT: add a2, a3, a2
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s s2, v8
; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: li a3, 14
+; ZVFHMIN64-NEXT: li a3, 11
; ZVFHMIN64-NEXT: mul a2, a2, a3
; ZVFHMIN64-NEXT: add a2, sp, a2
; ZVFHMIN64-NEXT: addi a2, a2, 800
@@ -3057,301 +3049,305 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: sb a0, 191(sp)
; ZVFHMIN64-NEXT: lh a0, 764(sp)
; ZVFHMIN64-NEXT: lh a1, 508(sp)
-; ZVFHMIN64-NEXT: vmv.x.s t5, v6
-; ZVFHMIN64-NEXT: csrr a2, vlenb
-; ZVFHMIN64-NEXT: slli a2, a2, 2
-; ZVFHMIN64-NEXT: add a2, sp, a2
-; ZVFHMIN64-NEXT: addi a2, a2, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s a2, v8
+; ZVFHMIN64-NEXT: vmv.x.s t5, v4
+; ZVFHMIN64-NEXT: vmv.x.s t4, v30
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 190(sp)
; ZVFHMIN64-NEXT: lh a0, 762(sp)
; ZVFHMIN64-NEXT: lh a1, 506(sp)
+; ZVFHMIN64-NEXT: csrr a2, vlenb
+; ZVFHMIN64-NEXT: slli a2, a2, 2
+; ZVFHMIN64-NEXT: add a2, sp, a2
+; ZVFHMIN64-NEXT: addi a2, a2, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: csrr a3, vlenb
-; ZVFHMIN64-NEXT: slli a3, a3, 3
+; ZVFHMIN64-NEXT: slli a3, a3, 1
; ZVFHMIN64-NEXT: add a3, sp, a3
; ZVFHMIN64-NEXT: addi a3, a3, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 189(sp)
+; ZVFHMIN64-NEXT: lh a0, 760(sp)
+; ZVFHMIN64-NEXT: lh a1, 504(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t3
; ZVFHMIN64-NEXT: csrr a4, vlenb
-; ZVFHMIN64-NEXT: li a5, 6
-; ZVFHMIN64-NEXT: mul a4, a4, a5
+; ZVFHMIN64-NEXT: li t3, 6
+; ZVFHMIN64-NEXT: mul a4, a4, t3
; ZVFHMIN64-NEXT: add a4, sp, a4
; ZVFHMIN64-NEXT: addi a4, a4, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a4, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 189(sp)
-; ZVFHMIN64-NEXT: lh a1, 760(sp)
-; ZVFHMIN64-NEXT: lh a5, 504(sp)
-; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li s3, 12
-; ZVFHMIN64-NEXT: mul a0, a0, s3
-; ZVFHMIN64-NEXT: add a0, sp, a0
-; ZVFHMIN64-NEXT: addi a0, a0, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s s5, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT: sb a0, 188(sp)
+; ZVFHMIN64-NEXT: lh a0, 758(sp)
+; ZVFHMIN64-NEXT: lh a1, 502(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: csrr t2, vlenb
+; ZVFHMIN64-NEXT: slli t2, t2, 3
+; ZVFHMIN64-NEXT: add t2, sp, t2
+; ZVFHMIN64-NEXT: addi t2, t2, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t2, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT: sb a0, 187(sp)
+; ZVFHMIN64-NEXT: lh a0, 756(sp)
+; ZVFHMIN64-NEXT: lh a1, 500(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa3, t1
+; ZVFHMIN64-NEXT: csrr t1, vlenb
+; ZVFHMIN64-NEXT: li t3, 13
+; ZVFHMIN64-NEXT: mul t1, t1, t3
+; ZVFHMIN64-NEXT: add t1, sp, t1
+; ZVFHMIN64-NEXT: addi t1, t1, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s t3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT: sb a0, 186(sp)
+; ZVFHMIN64-NEXT: lh a0, 754(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa2, t0
+; ZVFHMIN64-NEXT: lh a1, 498(sp)
+; ZVFHMIN64-NEXT: csrr t0, vlenb
+; ZVFHMIN64-NEXT: li t1, 19
+; ZVFHMIN64-NEXT: mul t0, t0, t1
+; ZVFHMIN64-NEXT: add t0, sp, t0
+; ZVFHMIN64-NEXT: addi t0, t0, 800
+; ZVFHMIN64-NEXT: vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT: vmv.x.s s3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a0
; ZVFHMIN64-NEXT: csrr a0, vlenb
-; ZVFHMIN64-NEXT: li s3, 10
-; ZVFHMIN64-NEXT: mul a0, a0, s3
+; ZVFHMIN64-NEXT: li t0, 21
+; ZVFHMIN64-NEXT: mul a0, a0, t0
; ZVFHMIN64-NEXT: add a0, sp, a0
; ZVFHMIN64-NEXT: addi a0, a0, 800
; ZVFHMIN64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN64-NEXT: vmv.x.s a0, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 188(sp)
-; ZVFHMIN64-NEXT: lh a1, 758(sp)
-; ZVFHMIN64-NEXT: lh a5, 502(sp)
-; ZVFHMIN64-NEXT: csrr s3, vlenb
-; ZVFHMIN64-NEXT: slli s3, s3, 4
-; ZVFHMIN64-NEXT: add s3, sp, s3
-; ZVFHMIN64-NEXT: addi s3, s3, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s s4, v8
-; ZVFHMIN64-NEXT: vmv.x.s s3, v16
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT: sb a1, 187(sp)
-; ZVFHMIN64-NEXT: lh a1, 756(sp)
-; ZVFHMIN64-NEXT: lh a5, 500(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t3
-; ZVFHMIN64-NEXT: sb a1, 186(sp)
-; ZVFHMIN64-NEXT: lh a1, 754(sp)
-; ZVFHMIN64-NEXT: lh a2, 498(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t1
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa1, fa0
+; ZVFHMIN64-NEXT: fmv.h.x fa1, a2
; ZVFHMIN64-NEXT: sb a1, 185(sp)
; ZVFHMIN64-NEXT: lh a1, 752(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa0, a3
; ZVFHMIN64-NEXT: lh a2, 496(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4
+; ZVFHMIN64-NEXT: feq.h t0, fa5, fa1
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: feq.h t1, fa4, fa0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
; ZVFHMIN64-NEXT: sb a1, 184(sp)
; ZVFHMIN64-NEXT: lh a1, 750(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
; ZVFHMIN64-NEXT: lh a2, 494(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s5
-; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT: sb a1, 183(sp)
-; ZVFHMIN64-NEXT: lh a1, 748(sp)
-; ZVFHMIN64-NEXT: lh a2, 492(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: feq.h a3, fa3, fa5
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa2, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT: sb a1, 182(sp)
-; ZVFHMIN64-NEXT: lh a1, 746(sp)
-; ZVFHMIN64-NEXT: lh a2, 490(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, s4
-; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a2, 183(sp)
+; ZVFHMIN64-NEXT: lh a2, 748(sp)
+; ZVFHMIN64-NEXT: lh a4, 492(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT: sb a1, 181(sp)
-; ZVFHMIN64-NEXT: lh a1, 744(sp)
-; ZVFHMIN64-NEXT: lh a2, 488(sp)
+; ZVFHMIN64-NEXT: sb a2, 182(sp)
+; ZVFHMIN64-NEXT: lh a2, 746(sp)
+; ZVFHMIN64-NEXT: lh a4, 490(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, s3
; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: ld a2, 80(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: addi a2, sp, 800
-; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT: vmv.x.s a2, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a5
+; ZVFHMIN64-NEXT: sb a2, 181(sp)
+; ZVFHMIN64-NEXT: lh a2, 744(sp)
+; ZVFHMIN64-NEXT: lh a4, 488(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: ld a4, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: vmv.x.s a5, v0
; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT: vmv.x.s a5, v8
-; ZVFHMIN64-NEXT: sb a1, 180(sp)
-; ZVFHMIN64-NEXT: lh a1, 742(sp)
-; ZVFHMIN64-NEXT: lh a7, 486(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: vmv.x.s a4, v8
+; ZVFHMIN64-NEXT: sb a2, 180(sp)
+; ZVFHMIN64-NEXT: lh a2, 742(sp)
+; ZVFHMIN64-NEXT: lh t2, 486(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 179(sp)
-; ZVFHMIN64-NEXT: lh a1, 740(sp)
-; ZVFHMIN64-NEXT: lh a7, 484(sp)
-; ZVFHMIN64-NEXT: sb a3, 140(sp)
-; ZVFHMIN64-NEXT: sb t1, 141(sp)
-; ZVFHMIN64-NEXT: sb t3, 142(sp)
-; ZVFHMIN64-NEXT: sb t4, 143(sp)
-; ZVFHMIN64-NEXT: sb a2, 136(sp)
-; ZVFHMIN64-NEXT: sb a6, 137(sp)
-; ZVFHMIN64-NEXT: sb a4, 138(sp)
-; ZVFHMIN64-NEXT: sb a0, 139(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: sb a2, 179(sp)
+; ZVFHMIN64-NEXT: lh a2, 740(sp)
+; ZVFHMIN64-NEXT: lh t2, 484(sp)
+; ZVFHMIN64-NEXT: sb a1, 140(sp)
+; ZVFHMIN64-NEXT: sb a3, 141(sp)
+; ZVFHMIN64-NEXT: sb t1, 142(sp)
+; ZVFHMIN64-NEXT: sb t0, 143(sp)
+; ZVFHMIN64-NEXT: sb a5, 136(sp)
+; ZVFHMIN64-NEXT: sb a0, 137(sp)
+; ZVFHMIN64-NEXT: sb a6, 138(sp)
+; ZVFHMIN64-NEXT: sb a7, 139(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 178(sp)
-; ZVFHMIN64-NEXT: lh a1, 638(sp)
-; ZVFHMIN64-NEXT: lh a2, 382(sp)
+; ZVFHMIN64-NEXT: lh a0, 638(sp)
+; ZVFHMIN64-NEXT: lh a1, 382(sp)
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14
-; ZVFHMIN64-NEXT: vmv.x.s a0, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 255(sp)
-; ZVFHMIN64-NEXT: lh a1, 636(sp)
-; ZVFHMIN64-NEXT: lh a2, 380(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13
; ZVFHMIN64-NEXT: vmv.x.s t2, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 254(sp)
-; ZVFHMIN64-NEXT: lh a1, 634(sp)
-; ZVFHMIN64-NEXT: lh a2, 378(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 255(sp)
+; ZVFHMIN64-NEXT: lh a0, 636(sp)
+; ZVFHMIN64-NEXT: lh a1, 380(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13
; ZVFHMIN64-NEXT: vmv.x.s t1, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 253(sp)
-; ZVFHMIN64-NEXT: lh a1, 632(sp)
-; ZVFHMIN64-NEXT: lh a2, 376(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 254(sp)
+; ZVFHMIN64-NEXT: lh a0, 634(sp)
+; ZVFHMIN64-NEXT: lh a1, 378(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12
; ZVFHMIN64-NEXT: vmv.x.s t0, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 252(sp)
-; ZVFHMIN64-NEXT: lh a1, 630(sp)
-; ZVFHMIN64-NEXT: lh a2, 374(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 253(sp)
+; ZVFHMIN64-NEXT: lh a0, 632(sp)
+; ZVFHMIN64-NEXT: lh a1, 376(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11
; ZVFHMIN64-NEXT: vmv.x.s a7, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: sb a1, 251(sp)
-; ZVFHMIN64-NEXT: lh a1, 628(sp)
-; ZVFHMIN64-NEXT: lh a2, 372(sp)
-; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 252(sp)
+; ZVFHMIN64-NEXT: lh a0, 630(sp)
+; ZVFHMIN64-NEXT: lh a1, 374(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10
; ZVFHMIN64-NEXT: vmv.x.s a6, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: ld a2, 88(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: sb a1, 250(sp)
-; ZVFHMIN64-NEXT: lh a1, 626(sp)
-; ZVFHMIN64-NEXT: lh a2, 370(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT: ld a2, 96(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT: sb a1, 249(sp)
-; ZVFHMIN64-NEXT: lh a1, 624(sp)
-; ZVFHMIN64-NEXT: lh a2, 368(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 248(sp)
-; ZVFHMIN64-NEXT: lh a0, 622(sp)
-; ZVFHMIN64-NEXT: lh a1, 366(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 251(sp)
+; ZVFHMIN64-NEXT: lh a0, 628(sp)
+; ZVFHMIN64-NEXT: lh a1, 372(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9
+; ZVFHMIN64-NEXT: vmv.x.s a5, v8
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 247(sp)
-; ZVFHMIN64-NEXT: lh a0, 620(sp)
-; ZVFHMIN64-NEXT: lh a1, 364(sp)
-; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: sb a0, 250(sp)
+; ZVFHMIN64-NEXT: lh a0, 626(sp)
+; ZVFHMIN64-NEXT: lh a1, 370(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT: sb a0, 246(sp)
-; ZVFHMIN64-NEXT: lh a0, 618(sp)
-; ZVFHMIN64-NEXT: lh a1, 362(sp)
+; ZVFHMIN64-NEXT: sb a0, 249(sp)
+; ZVFHMIN64-NEXT: lh a1, 624(sp)
+; ZVFHMIN64-NEXT: lh a3, 368(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: ld a3, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: sb a1, 248(sp)
+; ZVFHMIN64-NEXT: lh a1, 622(sp)
+; ZVFHMIN64-NEXT: lh a3, 366(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: ld a3, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: sb a1, 247(sp)
+; ZVFHMIN64-NEXT: lh a1, 620(sp)
+; ZVFHMIN64-NEXT: lh a3, 364(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, t0
; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT: sb a0, 245(sp)
-; ZVFHMIN64-NEXT: lh a0, 616(sp)
-; ZVFHMIN64-NEXT: lh a1, 360(sp)
+; ZVFHMIN64-NEXT: sb a1, 246(sp)
+; ZVFHMIN64-NEXT: lh a1, 618(sp)
+; ZVFHMIN64-NEXT: lh a3, 362(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT: sb a0, 244(sp)
-; ZVFHMIN64-NEXT: lh a0, 614(sp)
-; ZVFHMIN64-NEXT: lh a1, 358(sp)
+; ZVFHMIN64-NEXT: sb a1, 245(sp)
+; ZVFHMIN64-NEXT: lh a1, 616(sp)
+; ZVFHMIN64-NEXT: lh a3, 360(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa4, a6
; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
; ZVFHMIN64-NEXT: fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT: sb a1, 244(sp)
+; ZVFHMIN64-NEXT: lh a1, 614(sp)
+; ZVFHMIN64-NEXT: lh a3, 358(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8
-; ZVFHMIN64-NEXT: vmv.x.s a1, v8
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT: sb a0, 243(sp)
-; ZVFHMIN64-NEXT: lh a0, 612(sp)
-; ZVFHMIN64-NEXT: lh a1, 356(sp)
-; ZVFHMIN64-NEXT: sb a5, 204(sp)
-; ZVFHMIN64-NEXT: sb a2, 205(sp)
-; ZVFHMIN64-NEXT: sb a3, 206(sp)
-; ZVFHMIN64-NEXT: sb a4, 207(sp)
-; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT: sb a2, 200(sp)
-; ZVFHMIN64-NEXT: sb a6, 201(sp)
-; ZVFHMIN64-NEXT: sb a7, 202(sp)
-; ZVFHMIN64-NEXT: sb t0, 203(sp)
-; ZVFHMIN64-NEXT: li a2, 128
-; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: vmv.x.s a3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: sb a1, 243(sp)
+; ZVFHMIN64-NEXT: lh a1, 612(sp)
+; ZVFHMIN64-NEXT: lh a3, 356(sp)
+; ZVFHMIN64-NEXT: sb t0, 204(sp)
+; ZVFHMIN64-NEXT: sb a4, 205(sp)
+; ZVFHMIN64-NEXT: sb a0, 206(sp)
+; ZVFHMIN64-NEXT: sb a2, 207(sp)
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 242(sp)
-; ZVFHMIN64-NEXT: addi a0, sp, 128
-; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN64-NEXT: vle8.v v8, (a0)
+; ZVFHMIN64-NEXT: sb a0, 200(sp)
+; ZVFHMIN64-NEXT: sb a5, 201(sp)
+; ZVFHMIN64-NEXT: sb a6, 202(sp)
+; ZVFHMIN64-NEXT: sb a7, 203(sp)
+; ZVFHMIN64-NEXT: li a0, 128
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 242(sp)
+; ZVFHMIN64-NEXT: addi a1, sp, 128
+; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN64-NEXT: vle8.v v8, (a1)
; ZVFHMIN64-NEXT: vand.vi v8, v8, 1
; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN64-NEXT: addi sp, s0, -896
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index 5b272c98a1e0ac..dd2a8240ee2533 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -507,34 +507,26 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
; RV32-LABEL: match_nxv16i8_v32i8:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: .cfi_offset s1, -12
-; RV32-NEXT: .cfi_offset s2, -16
-; RV32-NEXT: .cfi_offset s3, -20
-; RV32-NEXT: .cfi_offset s4, -24
-; RV32-NEXT: .cfi_offset s5, -28
-; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: .cfi_offset s7, -36
-; RV32-NEXT: .cfi_offset s8, -40
-; RV32-NEXT: .cfi_offset s9, -44
-; RV32-NEXT: .cfi_offset s10, -48
-; RV32-NEXT: .cfi_offset s11, -52
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
+; RV32-NEXT: .cfi_offset s2, -12
+; RV32-NEXT: .cfi_offset s3, -16
+; RV32-NEXT: .cfi_offset s4, -20
+; RV32-NEXT: .cfi_offset s5, -24
+; RV32-NEXT: .cfi_offset s6, -28
+; RV32-NEXT: .cfi_offset s7, -32
+; RV32-NEXT: .cfi_offset s8, -36
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v10
; RV32-NEXT: vslidedown.vi v12, v10, 1
@@ -592,43 +584,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: vmv.x.s s5, v15
; RV32-NEXT: vmv.x.s s6, v16
; RV32-NEXT: vmv.x.s s7, v17
-; RV32-NEXT: vmv.x.s s8, v18
-; RV32-NEXT: vmv.x.s s9, v19
-; RV32-NEXT: vmv.x.s s10, v20
-; RV32-NEXT: vmv.x.s s11, v21
-; RV32-NEXT: vsetvli ra, zero, e8, m2, ta, ma
+; RV32-NEXT: vsetvli s8, zero, e8, m2, ta, ma
; RV32-NEXT: vmseq.vx v12, v8, a0
-; RV32-NEXT: vmv.x.s a0, v22
+; RV32-NEXT: vmv.x.s a0, v18
; RV32-NEXT: vmseq.vx v13, v8, s2
-; RV32-NEXT: vmv.x.s s2, v23
+; RV32-NEXT: vmv.x.s s2, v19
; RV32-NEXT: vmseq.vx v14, v8, s3
-; RV32-NEXT: vmv.x.s s3, v11
-; RV32-NEXT: vmseq.vx v11, v8, s4
-; RV32-NEXT: vmv.x.s s4, v24
-; RV32-NEXT: vmseq.vx v15, v8, s5
-; RV32-NEXT: vmv.x.s s5, v10
+; RV32-NEXT: vmv.x.s s3, v20
+; RV32-NEXT: vmseq.vx v15, v8, s4
+; RV32-NEXT: vmv.x.s s4, v21
+; RV32-NEXT: vmseq.vx v16, v8, s5
+; RV32-NEXT: vmv.x.s s5, v22
+; RV32-NEXT: vmseq.vx v17, v8, s6
+; RV32-NEXT: vmv.x.s s6, v23
+; RV32-NEXT: vmseq.vx v18, v8, s7
+; RV32-NEXT: vmv.x.s s7, v11
+; RV32-NEXT: vmseq.vx v11, v8, a0
+; RV32-NEXT: vmv.x.s a0, v24
+; RV32-NEXT: vmseq.vx v19, v8, s2
+; RV32-NEXT: vmv.x.s s2, v10
; RV32-NEXT: vmor.mm v10, v12, v13
-; RV32-NEXT: vmseq.vx v12, v8, s6
; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v13, v8, s7
-; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, s8
; RV32-NEXT: vmor.mm v10, v10, v15
-; RV32-NEXT: vmseq.vx v14, v8, s9
-; RV32-NEXT: vmor.mm v10, v10, v12
-; RV32-NEXT: vmseq.vx v12, v8, s10
-; RV32-NEXT: vmor.mm v10, v10, v13
-; RV32-NEXT: vmseq.vx v13, v8, s11
-; RV32-NEXT: vmor.mm v10, v10, v11
-; RV32-NEXT: vmseq.vx v11, v8, a0
-; RV32-NEXT: vmor.mm v10, v10, v14
-; RV32-NEXT: vmseq.vx v14, v8, s2
-; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmor.mm v10, v10, v16
+; RV32-NEXT: vmor.mm v10, v10, v17
; RV32-NEXT: vmseq.vx v12, v8, s3
-; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmor.mm v10, v10, v18
; RV32-NEXT: vmseq.vx v13, v8, s4
; RV32-NEXT: vmor.mm v10, v10, v11
; RV32-NEXT: vmseq.vx v11, v8, s5
+; RV32-NEXT: vmor.mm v10, v10, v19
+; RV32-NEXT: vmseq.vx v14, v8, s6
+; RV32-NEXT: vmor.mm v10, v10, v12
+; RV32-NEXT: vmseq.vx v12, v8, s7
+; RV32-NEXT: vmor.mm v10, v10, v13
+; RV32-NEXT: vmseq.vx v13, v8, a0
+; RV32-NEXT: vmor.mm v10, v10, v11
+; RV32-NEXT: vmseq.vx v11, v8, s2
; RV32-NEXT: vmor.mm v10, v10, v14
; RV32-NEXT: vmseq.vx v14, v8, a1
; RV32-NEXT: vmor.mm v10, v10, v12
@@ -666,20 +658,15 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: vmseq.vx v11, v8, s1
; RV32-NEXT: vmor.mm v8, v10, v11
; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
@@ -689,43 +676,32 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV32-NEXT: .cfi_restore s6
; RV32-NEXT: .cfi_restore s7
; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: .cfi_restore s10
-; RV32-NEXT: .cfi_restore s11
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: match_nxv16i8_v32i8:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -112
-; RV64-NEXT: .cfi_def_cfa_offset 112
-; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
-; RV64-NEXT: .cfi_offset s2, -32
-; RV64-NEXT: .cfi_offset s3, -40
-; RV64-NEXT: .cfi_offset s4, -48
-; RV64-NEXT: .cfi_offset s5, -56
-; RV64-NEXT: .cfi_offset s6, -64
-; RV64-NEXT: .cfi_offset s7, -72
-; RV64-NEXT: .cfi_offset s8, -80
-; RV64-NEXT: .cfi_offset s9, -88
-; RV64-NEXT: .cfi_offset s10, -96
-; RV64-NEXT: .cfi_offset s11, -104
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: .cfi_def_cfa_offset 80
+; RV64-NEXT: sd s0, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s7, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s8, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: .cfi_offset s1, -16
+; RV64-NEXT: .cfi_offset s2, -24
+; RV64-NEXT: .cfi_offset s3, -32
+; RV64-NEXT: .cfi_offset s4, -40
+; RV64-NEXT: .cfi_offset s5, -48
+; RV64-NEXT: .cfi_offset s6, -56
+; RV64-NEXT: .cfi_offset s7, -64
+; RV64-NEXT: .cfi_offset s8, -72
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v10
; RV64-NEXT: vslidedown.vi v12, v10, 1
@@ -783,43 +759,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: vmv.x.s s5, v15
; RV64-NEXT: vmv.x.s s6, v16
; RV64-NEXT: vmv.x.s s7, v17
-; RV64-NEXT: vmv.x.s s8, v18
-; RV64-NEXT: vmv.x.s s9, v19
-; RV64-NEXT: vmv.x.s s10, v20
-; RV64-NEXT: vmv.x.s s11, v21
-; RV64-NEXT: vsetvli ra, zero, e8, m2, ta, ma
+; RV64-NEXT: vsetvli s8, zero, e8, m2, ta, ma
; RV64-NEXT: vmseq.vx v12, v8, a0
-; RV64-NEXT: vmv.x.s a0, v22
+; RV64-NEXT: vmv.x.s a0, v18
; RV64-NEXT: vmseq.vx v13, v8, s2
-; RV64-NEXT: vmv.x.s s2, v23
+; RV64-NEXT: vmv.x.s s2, v19
; RV64-NEXT: vmseq.vx v14, v8, s3
-; RV64-NEXT: vmv.x.s s3, v11
-; RV64-NEXT: vmseq.vx v11, v8, s4
-; RV64-NEXT: vmv.x.s s4, v24
-; RV64-NEXT: vmseq.vx v15, v8, s5
-; RV64-NEXT: vmv.x.s s5, v10
+; RV64-NEXT: vmv.x.s s3, v20
+; RV64-NEXT: vmseq.vx v15, v8, s4
+; RV64-NEXT: vmv.x.s s4, v21
+; RV64-NEXT: vmseq.vx v16, v8, s5
+; RV64-NEXT: vmv.x.s s5, v22
+; RV64-NEXT: vmseq.vx v17, v8, s6
+; RV64-NEXT: vmv.x.s s6, v23
+; RV64-NEXT: vmseq.vx v18, v8, s7
+; RV64-NEXT: vmv.x.s s7, v11
+; RV64-NEXT: vmseq.vx v11, v8, a0
+; RV64-NEXT: vmv.x.s a0, v24
+; RV64-NEXT: vmseq.vx v19, v8, s2
+; RV64-NEXT: vmv.x.s s2, v10
; RV64-NEXT: vmor.mm v10, v12, v13
-; RV64-NEXT: vmseq.vx v12, v8, s6
; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v13, v8, s7
-; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, s8
; RV64-NEXT: vmor.mm v10, v10, v15
-; RV64-NEXT: vmseq.vx v14, v8, s9
-; RV64-NEXT: vmor.mm v10, v10, v12
-; RV64-NEXT: vmseq.vx v12, v8, s10
-; RV64-NEXT: vmor.mm v10, v10, v13
-; RV64-NEXT: vmseq.vx v13, v8, s11
-; RV64-NEXT: vmor.mm v10, v10, v11
-; RV64-NEXT: vmseq.vx v11, v8, a0
-; RV64-NEXT: vmor.mm v10, v10, v14
-; RV64-NEXT: vmseq.vx v14, v8, s2
-; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmor.mm v10, v10, v16
+; RV64-NEXT: vmor.mm v10, v10, v17
; RV64-NEXT: vmseq.vx v12, v8, s3
-; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmor.mm v10, v10, v18
; RV64-NEXT: vmseq.vx v13, v8, s4
; RV64-NEXT: vmor.mm v10, v10, v11
; RV64-NEXT: vmseq.vx v11, v8, s5
+; RV64-NEXT: vmor.mm v10, v10, v19
+; RV64-NEXT: vmseq.vx v14, v8, s6
+; RV64-NEXT: vmor.mm v10, v10, v12
+; RV64-NEXT: vmseq.vx v12, v8, s7
+; RV64-NEXT: vmor.mm v10, v10, v13
+; RV64-NEXT: vmseq.vx v13, v8, a0
+; RV64-NEXT: vmor.mm v10, v10, v11
+; RV64-NEXT: vmseq.vx v11, v8, s2
; RV64-NEXT: vmor.mm v10, v10, v14
; RV64-NEXT: vmseq.vx v14, v8, a1
; RV64-NEXT: vmor.mm v10, v10, v12
@@ -857,20 +833,15 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: vmseq.vx v11, v8, s1
; RV64-NEXT: vmor.mm v8, v10, v11
; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: ld s0, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s7, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s8, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: .cfi_restore s2
@@ -880,10 +851,7 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
; RV64-NEXT: .cfi_restore s6
; RV64-NEXT: .cfi_restore s7
; RV64-NEXT: .cfi_restore s8
-; RV64-NEXT: .cfi_restore s9
-; RV64-NEXT: .cfi_restore s10
-; RV64-NEXT: .cfi_restore s11
-; RV64-NEXT: addi sp, sp, 112
+; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
@@ -893,20 +861,16 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
; RV32-LABEL: match_v16i8_v32i8:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s10, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s11, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: .cfi_offset s2, -12
@@ -915,10 +879,6 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: .cfi_offset s5, -24
; RV32-NEXT: .cfi_offset s6, -28
; RV32-NEXT: .cfi_offset s7, -32
-; RV32-NEXT: .cfi_offset s8, -36
-; RV32-NEXT: .cfi_offset s9, -40
-; RV32-NEXT: .cfi_offset s10, -44
-; RV32-NEXT: .cfi_offset s11, -48
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v10
; RV32-NEXT: vslidedown.vi v9, v10, 1
@@ -976,42 +936,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: vmv.x.s s5, v14
; RV32-NEXT: vmv.x.s s6, v15
; RV32-NEXT: vmv.x.s s7, v16
-; RV32-NEXT: vmv.x.s s8, v17
-; RV32-NEXT: vmv.x.s s9, v18
-; RV32-NEXT: vmv.x.s s10, v19
-; RV32-NEXT: vmv.x.s s11, v20
; RV32-NEXT: vmseq.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a0, v21
+; RV32-NEXT: vmv.x.s a0, v17
; RV32-NEXT: vmseq.vx v12, v8, s2
-; RV32-NEXT: vmv.x.s s2, v22
+; RV32-NEXT: vmv.x.s s2, v18
; RV32-NEXT: vmseq.vx v13, v8, s3
-; RV32-NEXT: vmv.x.s s3, v11
-; RV32-NEXT: vmseq.vx v11, v8, s4
-; RV32-NEXT: vmv.x.s s4, v23
-; RV32-NEXT: vmseq.vx v14, v8, s5
-; RV32-NEXT: vmv.x.s s5, v10
+; RV32-NEXT: vmv.x.s s3, v19
+; RV32-NEXT: vmseq.vx v14, v8, s4
+; RV32-NEXT: vmv.x.s s4, v20
+; RV32-NEXT: vmseq.vx v15, v8, s5
+; RV32-NEXT: vmv.x.s s5, v21
+; RV32-NEXT: vmseq.vx v16, v8, s6
+; RV32-NEXT: vmv.x.s s6, v22
+; RV32-NEXT: vmseq.vx v17, v8, s7
+; RV32-NEXT: vmv.x.s s7, v11
+; RV32-NEXT: vmseq.vx v11, v8, a0
+; RV32-NEXT: vmv.x.s a0, v23
+; RV32-NEXT: vmseq.vx v18, v8, s2
+; RV32-NEXT: vmv.x.s s2, v10
; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v10, v8, s6
; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v12, v8, s7
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, s8
; RV32-NEXT: vmor.mm v9, v9, v14
-; RV32-NEXT: vmseq.vx v13, v8, s9
-; RV32-NEXT: vmor.mm v9, v9, v10
-; RV32-NEXT: vmseq.vx v10, v8, s10
-; RV32-NEXT: vmor.mm v9, v9, v12
-; RV32-NEXT: vmseq.vx v12, v8, s11
-; RV32-NEXT: vmor.mm v9, v9, v11
-; RV32-NEXT: vmseq.vx v11, v8, a0
-; RV32-NEXT: vmor.mm v9, v9, v13
-; RV32-NEXT: vmseq.vx v13, v8, s2
-; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmor.mm v9, v9, v15
+; RV32-NEXT: vmor.mm v9, v9, v16
; RV32-NEXT: vmseq.vx v10, v8, s3
-; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmor.mm v9, v9, v17
; RV32-NEXT: vmseq.vx v12, v8, s4
; RV32-NEXT: vmor.mm v9, v9, v11
; RV32-NEXT: vmseq.vx v11, v8, s5
+; RV32-NEXT: vmor.mm v9, v9, v18
+; RV32-NEXT: vmseq.vx v13, v8, s6
+; RV32-NEXT: vmor.mm v9, v9, v10
+; RV32-NEXT: vmseq.vx v10, v8, s7
+; RV32-NEXT: vmor.mm v9, v9, v12
+; RV32-NEXT: vmseq.vx v12, v8, a0
+; RV32-NEXT: vmor.mm v9, v9, v11
+; RV32-NEXT: vmseq.vx v11, v8, s2
; RV32-NEXT: vmor.mm v9, v9, v13
; RV32-NEXT: vmseq.vx v13, v8, a1
; RV32-NEXT: vmor.mm v9, v9, v10
@@ -1049,18 +1009,14 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: vmseq.vx v8, v8, s1
; RV32-NEXT: vmor.mm v8, v9, v8
; RV32-NEXT: vmand.mm v0, v8, v0
-; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s10, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s11, 0(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
@@ -1069,30 +1025,22 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV32-NEXT: .cfi_restore s5
; RV32-NEXT: .cfi_restore s6
; RV32-NEXT: .cfi_restore s7
-; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: .cfi_restore s10
-; RV32-NEXT: .cfi_restore s11
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: match_v16i8_v32i8:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -96
-; RV64-NEXT: .cfi_def_cfa_offset 96
-; RV64-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: .cfi_def_cfa_offset 64
+; RV64-NEXT: sd s0, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s7, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset s0, -8
; RV64-NEXT: .cfi_offset s1, -16
; RV64-NEXT: .cfi_offset s2, -24
@@ -1101,10 +1049,6 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: .cfi_offset s5, -48
; RV64-NEXT: .cfi_offset s6, -56
; RV64-NEXT: .cfi_offset s7, -64
-; RV64-NEXT: .cfi_offset s8, -72
-; RV64-NEXT: .cfi_offset s9, -80
-; RV64-NEXT: .cfi_offset s10, -88
-; RV64-NEXT: .cfi_offset s11, -96
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v10
; RV64-NEXT: vslidedown.vi v9, v10, 1
@@ -1162,42 +1106,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: vmv.x.s s5, v14
; RV64-NEXT: vmv.x.s s6, v15
; RV64-NEXT: vmv.x.s s7, v16
-; RV64-NEXT: vmv.x.s s8, v17
-; RV64-NEXT: vmv.x.s s9, v18
-; RV64-NEXT: vmv.x.s s10, v19
-; RV64-NEXT: vmv.x.s s11, v20
; RV64-NEXT: vmseq.vx v9, v8, a0
-; RV64-NEXT: vmv.x.s a0, v21
+; RV64-NEXT: vmv.x.s a0, v17
; RV64-NEXT: vmseq.vx v12, v8, s2
-; RV64-NEXT: vmv.x.s s2, v22
+; RV64-NEXT: vmv.x.s s2, v18
; RV64-NEXT: vmseq.vx v13, v8, s3
-; RV64-NEXT: vmv.x.s s3, v11
-; RV64-NEXT: vmseq.vx v11, v8, s4
-; RV64-NEXT: vmv.x.s s4, v23
-; RV64-NEXT: vmseq.vx v14, v8, s5
-; RV64-NEXT: vmv.x.s s5, v10
+; RV64-NEXT: vmv.x.s s3, v19
+; RV64-NEXT: vmseq.vx v14, v8, s4
+; RV64-NEXT: vmv.x.s s4, v20
+; RV64-NEXT: vmseq.vx v15, v8, s5
+; RV64-NEXT: vmv.x.s s5, v21
+; RV64-NEXT: vmseq.vx v16, v8, s6
+; RV64-NEXT: vmv.x.s s6, v22
+; RV64-NEXT: vmseq.vx v17, v8, s7
+; RV64-NEXT: vmv.x.s s7, v11
+; RV64-NEXT: vmseq.vx v11, v8, a0
+; RV64-NEXT: vmv.x.s a0, v23
+; RV64-NEXT: vmseq.vx v18, v8, s2
+; RV64-NEXT: vmv.x.s s2, v10
; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v10, v8, s6
; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v12, v8, s7
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, s8
; RV64-NEXT: vmor.mm v9, v9, v14
-; RV64-NEXT: vmseq.vx v13, v8, s9
-; RV64-NEXT: vmor.mm v9, v9, v10
-; RV64-NEXT: vmseq.vx v10, v8, s10
-; RV64-NEXT: vmor.mm v9, v9, v12
-; RV64-NEXT: vmseq.vx v12, v8, s11
-; RV64-NEXT: vmor.mm v9, v9, v11
-; RV64-NEXT: vmseq.vx v11, v8, a0
-; RV64-NEXT: vmor.mm v9, v9, v13
-; RV64-NEXT: vmseq.vx v13, v8, s2
-; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmor.mm v9, v9, v15
+; RV64-NEXT: vmor.mm v9, v9, v16
; RV64-NEXT: vmseq.vx v10, v8, s3
-; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmor.mm v9, v9, v17
; RV64-NEXT: vmseq.vx v12, v8, s4
; RV64-NEXT: vmor.mm v9, v9, v11
; RV64-NEXT: vmseq.vx v11, v8, s5
+; RV64-NEXT: vmor.mm v9, v9, v18
+; RV64-NEXT: vmseq.vx v13, v8, s6
+; RV64-NEXT: vmor.mm v9, v9, v10
+; RV64-NEXT: vmseq.vx v10, v8, s7
+; RV64-NEXT: vmor.mm v9, v9, v12
+; RV64-NEXT: vmseq.vx v12, v8, a0
+; RV64-NEXT: vmor.mm v9, v9, v11
+; RV64-NEXT: vmseq.vx v11, v8, s2
; RV64-NEXT: vmor.mm v9, v9, v13
; RV64-NEXT: vmseq.vx v13, v8, a1
; RV64-NEXT: vmor.mm v9, v9, v10
@@ -1235,18 +1179,14 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: vmseq.vx v8, v8, s1
; RV64-NEXT: vmor.mm v8, v9, v8
; RV64-NEXT: vmand.mm v0, v8, v0
-; RV64-NEXT: ld s0, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s7, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s8, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s9, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s11, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s7, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: .cfi_restore s2
@@ -1255,11 +1195,7 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
; RV64-NEXT: .cfi_restore s5
; RV64-NEXT: .cfi_restore s6
; RV64-NEXT: .cfi_restore s7
-; RV64-NEXT: .cfi_restore s8
-; RV64-NEXT: .cfi_restore s9
-; RV64-NEXT: .cfi_restore s10
-; RV64-NEXT: .cfi_restore s11
-; RV64-NEXT: addi sp, sp, 96
+; RV64-NEXT: addi sp, sp, 64
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 123048d996360c..22e6f23d4d6e6a 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2203,139 +2203,136 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t3, 5(a0)
-; RV32I-NEXT: lbu t4, 6(a0)
-; RV32I-NEXT: lbu s0, 7(a0)
-; RV32I-NEXT: lbu t2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s6, 10(a0)
-; RV32I-NEXT: lbu s8, 11(a0)
-; RV32I-NEXT: lbu s9, 12(a0)
-; RV32I-NEXT: lbu s10, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s7, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu ra, 18(a0)
-; RV32I-NEXT: lbu a3, 19(a0)
-; RV32I-NEXT: lbu t5, 20(a0)
-; RV32I-NEXT: lbu t6, 21(a0)
-; RV32I-NEXT: lbu a7, 22(a0)
-; RV32I-NEXT: lbu t0, 23(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t3, t3, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t3, t1
-; RV32I-NEXT: or a6, s0, t4
-; RV32I-NEXT: lbu t1, 24(a0)
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
; RV32I-NEXT: lbu s0, 25(a0)
; RV32I-NEXT: lbu s1, 26(a0)
; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s5, s5, 8
; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: or t2, s3, t2
-; RV32I-NEXT: or t3, s8, s6
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s7, s4
-; RV32I-NEXT: or s4, s11, s5
-; RV32I-NEXT: or s5, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s10, 2(a1)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t5, t6, t5
-; RV32I-NEXT: addi t6, sp, 8
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s0, t1
-; RV32I-NEXT: or t1, s2, s1
-; RV32I-NEXT: or s0, s6, s3
-; RV32I-NEXT: or s1, s9, s8
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s10
-; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s2
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or a0, a0, t4
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or a7, a7, t5
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: sw t2, 24(sp)
-; RV32I-NEXT: sw a7, 28(sp)
-; RV32I-NEXT: sw t0, 32(sp)
-; RV32I-NEXT: sw s0, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or a0, a0, t5
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s5, s3
+; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: sw a7, 16(sp)
; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: slli t1, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: add a1, t6, a1
+; RV32I-NEXT: add a1, s4, a1
; RV32I-NEXT: andi a0, t1, 24
-; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: xori a7, a0, 31
; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a6, 12(a1)
-; RV32I-NEXT: lw a7, 16(a1)
+; RV32I-NEXT: lw t0, 16(a1)
; RV32I-NEXT: lw t2, 20(a1)
; RV32I-NEXT: lw t3, 24(a1)
; RV32I-NEXT: lw t4, 28(a1)
@@ -2344,33 +2341,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a1, a3, t1
; RV32I-NEXT: slli t6, a4, 1
; RV32I-NEXT: srl a3, a6, t1
-; RV32I-NEXT: slli s0, a7, 1
+; RV32I-NEXT: slli s0, t0, 1
; RV32I-NEXT: srl a4, a5, t1
; RV32I-NEXT: slli s1, a6, 1
; RV32I-NEXT: srl a5, t2, t1
; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: srl a6, a7, t1
+; RV32I-NEXT: srl a6, t0, t1
; RV32I-NEXT: slli t2, t2, 1
-; RV32I-NEXT: srl a7, t3, t1
+; RV32I-NEXT: srl t0, t3, t1
; RV32I-NEXT: slli t3, t4, 1
; RV32I-NEXT: srl t1, t4, t1
-; RV32I-NEXT: sll t4, t5, t0
-; RV32I-NEXT: sll t5, t6, t0
-; RV32I-NEXT: sll t6, s0, t0
-; RV32I-NEXT: sll s0, s1, t0
-; RV32I-NEXT: sll s1, s2, t0
-; RV32I-NEXT: sll t2, t2, t0
-; RV32I-NEXT: sll t3, t3, t0
+; RV32I-NEXT: sll t4, t5, a7
+; RV32I-NEXT: sll t5, t6, a7
+; RV32I-NEXT: sll t6, s0, a7
+; RV32I-NEXT: sll s0, s1, a7
+; RV32I-NEXT: sll s1, s2, a7
+; RV32I-NEXT: sll t2, t2, a7
+; RV32I-NEXT: sll t3, t3, a7
; RV32I-NEXT: srli s2, t1, 24
; RV32I-NEXT: srli s3, t1, 16
; RV32I-NEXT: srli s4, t1, 8
-; RV32I-NEXT: or t0, a0, t4
+; RV32I-NEXT: or a7, a0, t4
; RV32I-NEXT: or t4, a1, t5
; RV32I-NEXT: or t5, a3, t6
; RV32I-NEXT: or s0, a4, s0
; RV32I-NEXT: or s1, a5, s1
; RV32I-NEXT: or t2, a6, t2
-; RV32I-NEXT: or t3, a7, t3
+; RV32I-NEXT: or t3, t0, t3
; RV32I-NEXT: sb t1, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
; RV32I-NEXT: sb s3, 30(a2)
@@ -2387,23 +2384,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s6, s0, 24
; RV32I-NEXT: srli s7, s0, 16
; RV32I-NEXT: srli s0, s0, 8
-; RV32I-NEXT: srli s8, t5, 24
-; RV32I-NEXT: srli s9, t5, 16
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: srli s10, t4, 24
-; RV32I-NEXT: srli s11, t4, 16
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: sb t0, 24(a2)
+; RV32I-NEXT: srli t0, t5, 24
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, t5, 16
+; RV32I-NEXT: srli t5, t5, 8
; RV32I-NEXT: sb t6, 26(a2)
+; RV32I-NEXT: srli t6, t4, 24
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, t0, 24
+; RV32I-NEXT: srli t1, t4, 16
+; RV32I-NEXT: srli t4, t4, 8
; RV32I-NEXT: sb a6, 16(a2)
+; RV32I-NEXT: srli a6, a7, 24
; RV32I-NEXT: sb t2, 17(a2)
; RV32I-NEXT: sb s3, 18(a2)
; RV32I-NEXT: sb s2, 19(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli a7, a7, 8
; RV32I-NEXT: sb a5, 20(a2)
; RV32I-NEXT: sb s1, 21(a2)
; RV32I-NEXT: sb s5, 22(a2)
@@ -2414,30 +2411,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: sb a3, 12(a2)
; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb s9, 14(a2)
-; RV32I-NEXT: sb s8, 15(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb t0, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb s11, 2(a2)
-; RV32I-NEXT: sb s10, 3(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb t6, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: sb t0, 5(a2)
-; RV32I-NEXT: sb a6, 6(a2)
-; RV32I-NEXT: sb a7, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb t2, 6(a2)
+; RV32I-NEXT: sb a6, 7(a2)
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2682,132 +2678,128 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
;
; RV32I-LABEL: lshr_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv a3, a1
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a7, 1(a0)
-; RV32I-NEXT: lbu t0, 2(a0)
-; RV32I-NEXT: lbu t1, 3(a0)
-; RV32I-NEXT: lbu s2, 4(a0)
-; RV32I-NEXT: lbu s4, 5(a0)
-; RV32I-NEXT: lbu s5, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s3, 8(a0)
-; RV32I-NEXT: lbu s9, 9(a0)
-; RV32I-NEXT: lbu s10, 10(a0)
-; RV32I-NEXT: lbu s11, 11(a0)
-; RV32I-NEXT: lbu ra, 12(a0)
-; RV32I-NEXT: lbu a1, 13(a0)
-; RV32I-NEXT: lbu t4, 14(a0)
-; RV32I-NEXT: lbu t6, 15(a0)
-; RV32I-NEXT: lbu a4, 16(a0)
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a6, 17(a0)
-; RV32I-NEXT: lbu t2, 18(a0)
-; RV32I-NEXT: lbu t3, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t5, 21(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
; RV32I-NEXT: lbu s0, 22(a0)
; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s2
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: lbu s2, 24(a0)
-; RV32I-NEXT: lbu s6, 25(a0)
-; RV32I-NEXT: lbu s7, 26(a0)
-; RV32I-NEXT: lbu s8, 27(a0)
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or s3, s9, s3
-; RV32I-NEXT: or s4, s11, s10
-; RV32I-NEXT: or s5, a1, ra
-; RV32I-NEXT: lbu s9, 28(a0)
-; RV32I-NEXT: lbu a1, 29(a0)
-; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a3, 0(a3)
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 8
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a3, a3, 2
-; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a6, a6, s11
-; RV32I-NEXT: or t2, t3, t2
-; RV32I-NEXT: or a4, t5, a4
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or t3, s6, s2
-; RV32I-NEXT: or t5, s8, s7
-; RV32I-NEXT: or a1, a1, s9
-; RV32I-NEXT: or a0, a0, s10
-; RV32I-NEXT: andi a3, a3, 28
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s3
-; RV32I-NEXT: or t1, t4, s5
-; RV32I-NEXT: or a6, t2, a6
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, t5, t3
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: add t6, t6, a3
-; RV32I-NEXT: sw a6, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw t2, 32(sp)
-; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: add s1, s1, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a7, 12(sp)
-; RV32I-NEXT: sw t0, 16(sp)
-; RV32I-NEXT: sw t1, 20(sp)
-; RV32I-NEXT: lw a6, 16(t6)
-; RV32I-NEXT: lw a5, 20(t6)
-; RV32I-NEXT: lw a7, 24(t6)
-; RV32I-NEXT: lw a1, 0(t6)
-; RV32I-NEXT: lw a0, 4(t6)
-; RV32I-NEXT: lw a4, 8(t6)
-; RV32I-NEXT: lw a3, 12(t6)
-; RV32I-NEXT: lw t0, 28(t6)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -2822,21 +2814,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -2848,36 +2840,35 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -2903,111 +2894,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a7, 1(a0)
-; RV64I-NEXT: lbu t2, 2(a0)
-; RV64I-NEXT: lbu s3, 3(a0)
-; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu s8, 5(a0)
-; RV64I-NEXT: lbu s9, 6(a0)
-; RV64I-NEXT: lbu s10, 7(a0)
-; RV64I-NEXT: lbu s2, 8(a0)
-; RV64I-NEXT: lbu s4, 9(a0)
-; RV64I-NEXT: lbu s5, 10(a0)
-; RV64I-NEXT: lbu s6, 11(a0)
-; RV64I-NEXT: lbu s7, 12(a0)
-; RV64I-NEXT: lbu s11, 13(a0)
-; RV64I-NEXT: lbu t1, 14(a0)
-; RV64I-NEXT: lbu t3, 15(a0)
-; RV64I-NEXT: lbu a3, 16(a0)
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu t4, 18(a0)
-; RV64I-NEXT: lbu t5, 19(a0)
-; RV64I-NEXT: lbu a4, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s0, 22(a0)
-; RV64I-NEXT: lbu s1, 23(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, s3, t2
-; RV64I-NEXT: or t0, s8, t0
-; RV64I-NEXT: or t2, s10, s9
-; RV64I-NEXT: lbu s3, 24(a0)
-; RV64I-NEXT: lbu s8, 25(a0)
-; RV64I-NEXT: lbu s9, 26(a0)
-; RV64I-NEXT: lbu s10, 27(a0)
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: slli s11, s11, 8
-; RV64I-NEXT: or s2, s4, s2
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: or s5, s11, s7
-; RV64I-NEXT: lbu s6, 28(a0)
-; RV64I-NEXT: lbu s7, 29(a0)
-; RV64I-NEXT: lbu s11, 30(a0)
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t3, 24(a0)
+; RV64I-NEXT: lbu t4, 25(a0)
+; RV64I-NEXT: lbu t5, 26(a0)
+; RV64I-NEXT: lbu t6, 27(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or s0, s5, s4
+; RV64I-NEXT: or s1, s7, s6
+; RV64I-NEXT: or s2, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
+; RV64I-NEXT: lbu s4, 29(a0)
+; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd zero, 40(sp)
; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or t1, t3, t1
-; RV64I-NEXT: mv t3, sp
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: slli t4, t4, 16
-; RV64I-NEXT: slli t5, t5, 24
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s0, s0, 16
-; RV64I-NEXT: slli s1, s1, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: slli s7, s7, 8
-; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: mv s7, sp
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a4, t6, a4
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or t4, s8, s3
-; RV64I-NEXT: or t5, s10, s9
-; RV64I-NEXT: or t6, s7, s6
-; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: or t4, t6, t5
+; RV64I-NEXT: or t5, s4, s3
+; RV64I-NEXT: or a0, a0, s5
; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t2, t0
-; RV64I-NEXT: or t0, s4, s2
-; RV64I-NEXT: or t1, t1, s5
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a4, s0, a4
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a0, a0, t6
-; RV64I-NEXT: add t3, t3, a1
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: or a7, s6, s2
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: add s7, s7, a1
; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: or a1, a7, a5
-; RV64I-NEXT: or a5, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: sd a1, 0(sp)
-; RV64I-NEXT: sd a5, 8(sp)
-; RV64I-NEXT: sd a3, 16(sp)
+; RV64I-NEXT: or a1, a6, a5
+; RV64I-NEXT: or a4, a7, s0
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: sd a1, 8(sp)
+; RV64I-NEXT: sd a4, 16(sp)
; RV64I-NEXT: sd a0, 24(sp)
-; RV64I-NEXT: ld a4, 16(t3)
-; RV64I-NEXT: ld a0, 8(t3)
-; RV64I-NEXT: ld a1, 0(t3)
-; RV64I-NEXT: ld a3, 24(t3)
+; RV64I-NEXT: ld a4, 16(s7)
+; RV64I-NEXT: ld a0, 8(s7)
+; RV64I-NEXT: ld a1, 0(s7)
+; RV64I-NEXT: ld a3, 24(s7)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -3026,25 +3017,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
-; RV64I-NEXT: srli s8, a1, 24
-; RV64I-NEXT: srli s9, a1, 16
-; RV64I-NEXT: srli s10, a1, 8
-; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
+; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
+; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 48
+; RV64I-NEXT: srli a5, a0, 56
; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli a4, a0, 40
+; RV64I-NEXT: srli t1, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli a6, a0, 32
+; RV64I-NEXT: srli t2, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -3054,19 +3045,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli a7, a0, 16
+; RV64I-NEXT: srli t3, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb s10, 1(a2)
-; RV64I-NEXT: sb s9, 2(a2)
-; RV64I-NEXT: sb s8, 3(a2)
+; RV64I-NEXT: sb a6, 1(a2)
+; RV64I-NEXT: sb a7, 2(a2)
+; RV64I-NEXT: sb t0, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a6, 12(a2)
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: sb a5, 14(a2)
-; RV64I-NEXT: sb s11, 15(a2)
+; RV64I-NEXT: sb t2, 12(a2)
+; RV64I-NEXT: sb t1, 13(a2)
+; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: sb a5, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a7, 10(a2)
+; RV64I-NEXT: sb t3, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -3085,132 +3076,128 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
;
; RV32I-LABEL: lshr_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv a3, a1
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a7, 1(a0)
-; RV32I-NEXT: lbu t0, 2(a0)
-; RV32I-NEXT: lbu t1, 3(a0)
-; RV32I-NEXT: lbu s2, 4(a0)
-; RV32I-NEXT: lbu s4, 5(a0)
-; RV32I-NEXT: lbu s5, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s3, 8(a0)
-; RV32I-NEXT: lbu s9, 9(a0)
-; RV32I-NEXT: lbu s10, 10(a0)
-; RV32I-NEXT: lbu s11, 11(a0)
-; RV32I-NEXT: lbu ra, 12(a0)
-; RV32I-NEXT: lbu a1, 13(a0)
-; RV32I-NEXT: lbu t4, 14(a0)
-; RV32I-NEXT: lbu t6, 15(a0)
-; RV32I-NEXT: lbu a4, 16(a0)
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a6, 17(a0)
-; RV32I-NEXT: lbu t2, 18(a0)
-; RV32I-NEXT: lbu t3, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t5, 21(a0)
-; RV32I-NEXT: lbu s0, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s2
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: lbu s2, 24(a0)
-; RV32I-NEXT: lbu s6, 25(a0)
-; RV32I-NEXT: lbu s7, 26(a0)
-; RV32I-NEXT: lbu s8, 27(a0)
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or s3, s9, s3
-; RV32I-NEXT: or s4, s11, s10
-; RV32I-NEXT: or s5, a1, ra
-; RV32I-NEXT: lbu s9, 28(a0)
-; RV32I-NEXT: lbu a1, 29(a0)
-; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu s0, 22(a0)
+; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a3, 0(a3)
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 8
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a3, a3, 3
-; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a6, a6, s11
-; RV32I-NEXT: or t2, t3, t2
-; RV32I-NEXT: or a4, t5, a4
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or t3, s6, s2
-; RV32I-NEXT: or t5, s8, s7
-; RV32I-NEXT: or a1, a1, s9
-; RV32I-NEXT: or a0, a0, s10
-; RV32I-NEXT: andi a3, a3, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s3
-; RV32I-NEXT: or t1, t4, s5
-; RV32I-NEXT: or a6, t2, a6
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, t5, t3
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: add t6, t6, a3
-; RV32I-NEXT: sw a6, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw t2, 32(sp)
-; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: add s1, s1, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a7, 12(sp)
-; RV32I-NEXT: sw t0, 16(sp)
-; RV32I-NEXT: sw t1, 20(sp)
-; RV32I-NEXT: lw a6, 16(t6)
-; RV32I-NEXT: lw a5, 20(t6)
-; RV32I-NEXT: lw a7, 24(t6)
-; RV32I-NEXT: lw a1, 0(t6)
-; RV32I-NEXT: lw a0, 4(t6)
-; RV32I-NEXT: lw a4, 8(t6)
-; RV32I-NEXT: lw a3, 12(t6)
-; RV32I-NEXT: lw t0, 28(t6)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -3225,21 +3212,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -3251,36 +3238,35 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -3524,132 +3510,129 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu s1, 0(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t3, 5(a0)
-; RV32I-NEXT: lbu t4, 6(a0)
-; RV32I-NEXT: lbu s0, 7(a0)
-; RV32I-NEXT: lbu t2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s6, 10(a0)
-; RV32I-NEXT: lbu s8, 11(a0)
-; RV32I-NEXT: lbu s9, 12(a0)
-; RV32I-NEXT: lbu s10, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s7, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu ra, 18(a0)
-; RV32I-NEXT: lbu a3, 19(a0)
-; RV32I-NEXT: lbu t5, 20(a0)
-; RV32I-NEXT: lbu t6, 21(a0)
-; RV32I-NEXT: lbu a7, 22(a0)
-; RV32I-NEXT: lbu t0, 23(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t3, t3, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t3, t1
-; RV32I-NEXT: or a6, s0, t4
-; RV32I-NEXT: lbu t1, 24(a0)
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
; RV32I-NEXT: lbu s0, 25(a0)
; RV32I-NEXT: lbu s1, 26(a0)
; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s5, s5, 8
; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: or t2, s3, t2
-; RV32I-NEXT: or t3, s8, s6
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: lbu s3, 28(a0)
-; RV32I-NEXT: lbu s6, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s7, s7, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s7, s4
-; RV32I-NEXT: or s4, s11, s5
-; RV32I-NEXT: or s5, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s10, 2(a1)
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli t6, t6, 8
-; RV32I-NEXT: or t5, t6, t5
-; RV32I-NEXT: addi t6, sp, 40
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: addi s4, sp, 32
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, s0, t1
-; RV32I-NEXT: or t1, s2, s1
-; RV32I-NEXT: or s0, s6, s3
-; RV32I-NEXT: or s1, s9, s8
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s10
-; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s2
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or a0, a0, t4
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or a7, a7, t5
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: sw t2, 56(sp)
-; RV32I-NEXT: sw a7, 60(sp)
-; RV32I-NEXT: sw t0, 64(sp)
-; RV32I-NEXT: sw s0, 68(sp)
-; RV32I-NEXT: sw a4, 40(sp)
-; RV32I-NEXT: sw a5, 44(sp)
-; RV32I-NEXT: sw a6, 48(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or a0, a0, t5
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s5, s3
+; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: sw a7, 48(sp)
; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw t0, 56(sp)
+; RV32I-NEXT: sw t1, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a6, 44(sp)
; RV32I-NEXT: slli a3, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: sub a1, t6, a1
+; RV32I-NEXT: sub a1, s4, a1
; RV32I-NEXT: andi a0, a3, 24
; RV32I-NEXT: xori a0, a0, 31
; RV32I-NEXT: lw a4, 0(a1)
@@ -3664,10 +3647,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli t4, a4, 1
; RV32I-NEXT: sll t5, a7, a3
; RV32I-NEXT: srli t6, a6, 1
-; RV32I-NEXT: sll s0, a6, a3
+; RV32I-NEXT: sll a6, a6, a3
; RV32I-NEXT: srli a5, a5, 1
-; RV32I-NEXT: sll s1, t1, a3
-; RV32I-NEXT: srli a6, t0, 1
+; RV32I-NEXT: sll s0, t1, a3
+; RV32I-NEXT: srli s1, t0, 1
; RV32I-NEXT: sll s2, t0, a3
; RV32I-NEXT: srli a7, a7, 1
; RV32I-NEXT: sll s3, a1, a3
@@ -3675,56 +3658,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sll s4, t2, a3
; RV32I-NEXT: srli t0, t1, 1
; RV32I-NEXT: sll s5, a4, a3
-; RV32I-NEXT: srl t2, t4, a0
-; RV32I-NEXT: srl t4, t6, a0
-; RV32I-NEXT: srl t6, a5, a0
-; RV32I-NEXT: srl s6, a6, a0
-; RV32I-NEXT: srl s7, a7, a0
-; RV32I-NEXT: srl s8, a1, a0
-; RV32I-NEXT: srl s9, t0, a0
-; RV32I-NEXT: srli t1, s4, 24
-; RV32I-NEXT: srli a7, s3, 24
+; RV32I-NEXT: srl t4, t4, a0
+; RV32I-NEXT: srl a4, t6, a0
+; RV32I-NEXT: srl t1, a5, a0
+; RV32I-NEXT: srl t6, s1, a0
+; RV32I-NEXT: srl s1, a7, a0
+; RV32I-NEXT: srl s6, a1, a0
+; RV32I-NEXT: srl s7, t0, a0
+; RV32I-NEXT: srli t2, s4, 24
+; RV32I-NEXT: srli t0, s3, 24
; RV32I-NEXT: srli a5, s2, 24
-; RV32I-NEXT: srli a3, s1, 24
-; RV32I-NEXT: srli a1, s0, 24
+; RV32I-NEXT: srli a3, s0, 24
+; RV32I-NEXT: srli a1, a6, 24
; RV32I-NEXT: srli a0, t5, 24
-; RV32I-NEXT: srli s10, s5, 24
-; RV32I-NEXT: srli s11, s5, 16
-; RV32I-NEXT: srli ra, s5, 8
-; RV32I-NEXT: srli a4, t3, 24
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or t0, t5, t4
-; RV32I-NEXT: or t2, s0, t6
-; RV32I-NEXT: or t3, s1, s6
-; RV32I-NEXT: or t4, s2, s7
-; RV32I-NEXT: or t5, s3, s8
-; RV32I-NEXT: or t6, s4, s9
+; RV32I-NEXT: srli s8, s5, 24
+; RV32I-NEXT: or a4, t5, a4
+; RV32I-NEXT: srli t5, s5, 16
+; RV32I-NEXT: or t1, a6, t1
+; RV32I-NEXT: srli s9, s5, 8
+; RV32I-NEXT: or a7, t3, t4
+; RV32I-NEXT: srli a6, t3, 24
+; RV32I-NEXT: or t3, s0, t6
+; RV32I-NEXT: or t4, s2, s1
+; RV32I-NEXT: or t6, s3, s6
+; RV32I-NEXT: or s0, s4, s7
; RV32I-NEXT: sb s5, 0(a2)
-; RV32I-NEXT: sb ra, 1(a2)
-; RV32I-NEXT: sb s11, 2(a2)
-; RV32I-NEXT: sb s10, 3(a2)
-; RV32I-NEXT: srli s0, t6, 16
-; RV32I-NEXT: srli s1, t6, 8
-; RV32I-NEXT: srli s2, t5, 16
-; RV32I-NEXT: srli s3, t5, 8
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb t5, 2(a2)
+; RV32I-NEXT: sb s8, 3(a2)
+; RV32I-NEXT: srli t5, s0, 16
+; RV32I-NEXT: srli s1, s0, 8
+; RV32I-NEXT: srli s2, t6, 16
+; RV32I-NEXT: srli s3, t6, 8
; RV32I-NEXT: srli s4, t4, 16
; RV32I-NEXT: srli s5, t4, 8
; RV32I-NEXT: srli s6, t3, 16
; RV32I-NEXT: srli s7, t3, 8
-; RV32I-NEXT: srli s8, t2, 16
-; RV32I-NEXT: srli s9, t2, 8
-; RV32I-NEXT: srli s10, t0, 16
-; RV32I-NEXT: srli s11, t0, 8
-; RV32I-NEXT: sb t6, 24(a2)
+; RV32I-NEXT: sb s0, 24(a2)
+; RV32I-NEXT: srli s0, t1, 16
; RV32I-NEXT: sb s1, 25(a2)
-; RV32I-NEXT: sb s0, 26(a2)
-; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a6, 16
-; RV32I-NEXT: sb t5, 28(a2)
+; RV32I-NEXT: srli s1, t1, 8
+; RV32I-NEXT: sb t5, 26(a2)
+; RV32I-NEXT: srli t5, a4, 16
+; RV32I-NEXT: sb t2, 27(a2)
+; RV32I-NEXT: srli t2, a4, 8
+; RV32I-NEXT: sb t6, 28(a2)
+; RV32I-NEXT: srli t6, a7, 16
; RV32I-NEXT: sb s3, 29(a2)
; RV32I-NEXT: sb s2, 30(a2)
-; RV32I-NEXT: sb a7, 31(a2)
-; RV32I-NEXT: srli a7, a6, 8
+; RV32I-NEXT: sb t0, 31(a2)
+; RV32I-NEXT: srli t0, a7, 8
; RV32I-NEXT: sb t4, 16(a2)
; RV32I-NEXT: sb s5, 17(a2)
; RV32I-NEXT: sb s4, 18(a2)
@@ -3733,32 +3716,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s7, 21(a2)
; RV32I-NEXT: sb s6, 22(a2)
; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: sb t2, 8(a2)
-; RV32I-NEXT: sb s9, 9(a2)
-; RV32I-NEXT: sb s8, 10(a2)
+; RV32I-NEXT: sb t1, 8(a2)
+; RV32I-NEXT: sb s1, 9(a2)
+; RV32I-NEXT: sb s0, 10(a2)
; RV32I-NEXT: sb a1, 11(a2)
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb t2, 13(a2)
+; RV32I-NEXT: sb t5, 14(a2)
; RV32I-NEXT: sb a0, 15(a2)
-; RV32I-NEXT: sb a6, 4(a2)
-; RV32I-NEXT: sb a7, 5(a2)
-; RV32I-NEXT: sb t1, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: sb a7, 4(a2)
+; RV32I-NEXT: sb t0, 5(a2)
+; RV32I-NEXT: sb t6, 6(a2)
+; RV32I-NEXT: sb a6, 7(a2)
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -4003,132 +3985,128 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
;
; RV32I-LABEL: shl_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv a3, a1
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a7, 1(a0)
-; RV32I-NEXT: lbu t0, 2(a0)
-; RV32I-NEXT: lbu t1, 3(a0)
-; RV32I-NEXT: lbu s2, 4(a0)
-; RV32I-NEXT: lbu s4, 5(a0)
-; RV32I-NEXT: lbu s5, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s3, 8(a0)
-; RV32I-NEXT: lbu s9, 9(a0)
-; RV32I-NEXT: lbu s10, 10(a0)
-; RV32I-NEXT: lbu s11, 11(a0)
-; RV32I-NEXT: lbu ra, 12(a0)
-; RV32I-NEXT: lbu a1, 13(a0)
-; RV32I-NEXT: lbu t4, 14(a0)
-; RV32I-NEXT: lbu t6, 15(a0)
-; RV32I-NEXT: lbu a4, 16(a0)
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a6, 17(a0)
-; RV32I-NEXT: lbu t2, 18(a0)
-; RV32I-NEXT: lbu t3, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t5, 21(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
; RV32I-NEXT: lbu s0, 22(a0)
; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s2
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: lbu s2, 24(a0)
-; RV32I-NEXT: lbu s6, 25(a0)
-; RV32I-NEXT: lbu s7, 26(a0)
-; RV32I-NEXT: lbu s8, 27(a0)
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or s3, s9, s3
-; RV32I-NEXT: or s4, s11, s10
-; RV32I-NEXT: or s5, a1, ra
-; RV32I-NEXT: lbu s9, 28(a0)
-; RV32I-NEXT: lbu a1, 29(a0)
-; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a3, 0(a3)
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 40
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: addi s1, sp, 32
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a3, a3, 2
-; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a6, a6, s11
-; RV32I-NEXT: or t2, t3, t2
-; RV32I-NEXT: or a4, t5, a4
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or t3, s6, s2
-; RV32I-NEXT: or t5, s8, s7
-; RV32I-NEXT: or a1, a1, s9
-; RV32I-NEXT: or a0, a0, s10
-; RV32I-NEXT: andi a3, a3, 28
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s3
-; RV32I-NEXT: or t1, t4, s5
-; RV32I-NEXT: or a6, t2, a6
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, t5, t3
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: sub t3, t6, a3
-; RV32I-NEXT: sw a6, 56(sp)
-; RV32I-NEXT: sw a4, 60(sp)
-; RV32I-NEXT: sw t2, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: sub s1, s1, a1
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
; RV32I-NEXT: sw a5, 40(sp)
-; RV32I-NEXT: sw a7, 44(sp)
-; RV32I-NEXT: sw t0, 48(sp)
-; RV32I-NEXT: sw t1, 52(sp)
-; RV32I-NEXT: lw a6, 16(t3)
-; RV32I-NEXT: lw a5, 20(t3)
-; RV32I-NEXT: lw a7, 24(t3)
-; RV32I-NEXT: lw a1, 0(t3)
-; RV32I-NEXT: lw a0, 4(t3)
-; RV32I-NEXT: lw a4, 8(t3)
-; RV32I-NEXT: lw a3, 12(t3)
-; RV32I-NEXT: lw t0, 28(t3)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -4143,21 +4121,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -4169,36 +4147,35 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -4224,111 +4201,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a7, 1(a0)
-; RV64I-NEXT: lbu t2, 2(a0)
-; RV64I-NEXT: lbu s3, 3(a0)
-; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu s8, 5(a0)
-; RV64I-NEXT: lbu s9, 6(a0)
-; RV64I-NEXT: lbu s10, 7(a0)
-; RV64I-NEXT: lbu s2, 8(a0)
-; RV64I-NEXT: lbu s4, 9(a0)
-; RV64I-NEXT: lbu s5, 10(a0)
-; RV64I-NEXT: lbu s6, 11(a0)
-; RV64I-NEXT: lbu s7, 12(a0)
-; RV64I-NEXT: lbu s11, 13(a0)
-; RV64I-NEXT: lbu t1, 14(a0)
-; RV64I-NEXT: lbu t3, 15(a0)
-; RV64I-NEXT: lbu a3, 16(a0)
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu t4, 18(a0)
-; RV64I-NEXT: lbu t5, 19(a0)
-; RV64I-NEXT: lbu a4, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s0, 22(a0)
-; RV64I-NEXT: lbu s1, 23(a0)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, s3, t2
-; RV64I-NEXT: or t0, s8, t0
-; RV64I-NEXT: or t2, s10, s9
-; RV64I-NEXT: lbu s3, 24(a0)
-; RV64I-NEXT: lbu s8, 25(a0)
-; RV64I-NEXT: lbu s9, 26(a0)
-; RV64I-NEXT: lbu s10, 27(a0)
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: slli s11, s11, 8
-; RV64I-NEXT: or s2, s4, s2
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: or s5, s11, s7
-; RV64I-NEXT: lbu s6, 28(a0)
-; RV64I-NEXT: lbu s7, 29(a0)
-; RV64I-NEXT: lbu s11, 30(a0)
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t3, 24(a0)
+; RV64I-NEXT: lbu t4, 25(a0)
+; RV64I-NEXT: lbu t5, 26(a0)
+; RV64I-NEXT: lbu t6, 27(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or s0, s5, s4
+; RV64I-NEXT: or s1, s7, s6
+; RV64I-NEXT: or s2, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
+; RV64I-NEXT: lbu s4, 29(a0)
+; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd zero, 8(sp)
; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: or t1, t3, t1
-; RV64I-NEXT: addi t3, sp, 32
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: slli t4, t4, 16
-; RV64I-NEXT: slli t5, t5, 24
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s0, s0, 16
-; RV64I-NEXT: slli s1, s1, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: slli s7, s7, 8
-; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: addi s7, sp, 32
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a4, t6, a4
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or t4, s8, s3
-; RV64I-NEXT: or t5, s10, s9
-; RV64I-NEXT: or t6, s7, s6
-; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: or t4, t6, t5
+; RV64I-NEXT: or t5, s4, s3
+; RV64I-NEXT: or a0, a0, s5
; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t2, t0
-; RV64I-NEXT: or t0, s4, s2
-; RV64I-NEXT: or t1, t1, s5
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a4, s0, a4
-; RV64I-NEXT: or a6, t5, t4
-; RV64I-NEXT: or a0, a0, t6
-; RV64I-NEXT: sub t2, t3, a1
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: or a7, s6, s2
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: sub t1, s7, a1
; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: or a1, a7, a5
-; RV64I-NEXT: or a5, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a0, a0, a6
-; RV64I-NEXT: sd a1, 32(sp)
-; RV64I-NEXT: sd a5, 40(sp)
-; RV64I-NEXT: sd a3, 48(sp)
+; RV64I-NEXT: or a1, a6, a5
+; RV64I-NEXT: or a4, a7, s0
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: sd a1, 40(sp)
+; RV64I-NEXT: sd a4, 48(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: ld a4, 16(t2)
-; RV64I-NEXT: ld a0, 8(t2)
-; RV64I-NEXT: ld a1, 0(t2)
-; RV64I-NEXT: ld a3, 24(t2)
+; RV64I-NEXT: ld a4, 16(t1)
+; RV64I-NEXT: ld a0, 8(t1)
+; RV64I-NEXT: ld a1, 0(t1)
+; RV64I-NEXT: ld a3, 24(t1)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -4347,25 +4324,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
-; RV64I-NEXT: srli s8, a1, 24
-; RV64I-NEXT: srli s9, a1, 16
-; RV64I-NEXT: srli s10, a1, 8
-; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
+; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
+; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 48
+; RV64I-NEXT: srli a5, a0, 56
; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli a4, a0, 40
+; RV64I-NEXT: srli t1, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli a6, a0, 32
+; RV64I-NEXT: srli t2, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -4375,19 +4352,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli a7, a0, 16
+; RV64I-NEXT: srli t3, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb s10, 1(a2)
-; RV64I-NEXT: sb s9, 2(a2)
-; RV64I-NEXT: sb s8, 3(a2)
+; RV64I-NEXT: sb a6, 1(a2)
+; RV64I-NEXT: sb a7, 2(a2)
+; RV64I-NEXT: sb t0, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a6, 12(a2)
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: sb a5, 14(a2)
-; RV64I-NEXT: sb s11, 15(a2)
+; RV64I-NEXT: sb t2, 12(a2)
+; RV64I-NEXT: sb t1, 13(a2)
+; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: sb a5, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a7, 10(a2)
+; RV64I-NEXT: sb t3, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -4406,132 +4383,128 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
;
; RV32I-LABEL: shl_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv a3, a1
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a7, 1(a0)
-; RV32I-NEXT: lbu t0, 2(a0)
-; RV32I-NEXT: lbu t1, 3(a0)
-; RV32I-NEXT: lbu s2, 4(a0)
-; RV32I-NEXT: lbu s4, 5(a0)
-; RV32I-NEXT: lbu s5, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s3, 8(a0)
-; RV32I-NEXT: lbu s9, 9(a0)
-; RV32I-NEXT: lbu s10, 10(a0)
-; RV32I-NEXT: lbu s11, 11(a0)
-; RV32I-NEXT: lbu ra, 12(a0)
-; RV32I-NEXT: lbu a1, 13(a0)
-; RV32I-NEXT: lbu t4, 14(a0)
-; RV32I-NEXT: lbu t6, 15(a0)
-; RV32I-NEXT: lbu a4, 16(a0)
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a6, 17(a0)
-; RV32I-NEXT: lbu t2, 18(a0)
-; RV32I-NEXT: lbu t3, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t5, 21(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s2, 12(a0)
+; RV32I-NEXT: lbu s3, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
; RV32I-NEXT: lbu s0, 22(a0)
; RV32I-NEXT: lbu s1, 23(a0)
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s2
-; RV32I-NEXT: or t1, s6, s5
-; RV32I-NEXT: lbu s2, 24(a0)
-; RV32I-NEXT: lbu s6, 25(a0)
-; RV32I-NEXT: lbu s7, 26(a0)
-; RV32I-NEXT: lbu s8, 27(a0)
-; RV32I-NEXT: slli s9, s9, 8
-; RV32I-NEXT: slli s10, s10, 16
-; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or s3, s9, s3
-; RV32I-NEXT: or s4, s11, s10
-; RV32I-NEXT: or s5, a1, ra
-; RV32I-NEXT: lbu s9, 28(a0)
-; RV32I-NEXT: lbu a1, 29(a0)
-; RV32I-NEXT: lbu s10, 30(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: slli s4, s4, 16
+; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s3, s2
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu s2, 25(a0)
+; RV32I-NEXT: lbu s3, 26(a0)
+; RV32I-NEXT: lbu s4, 27(a0)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: or t6, s11, s10
+; RV32I-NEXT: lbu s5, 28(a0)
+; RV32I-NEXT: lbu s6, 29(a0)
+; RV32I-NEXT: lbu s7, 30(a0)
; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a3, 0(a3)
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or t4, t6, t4
-; RV32I-NEXT: addi t6, sp, 40
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: slli t5, t5, 8
; RV32I-NEXT: slli s0, s0, 16
; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or s0, s1, s0
+; RV32I-NEXT: addi s1, sp, 32
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s3, s3, 16
+; RV32I-NEXT: slli s4, s4, 24
; RV32I-NEXT: slli s6, s6, 8
; RV32I-NEXT: slli s7, s7, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli a3, a3, 3
-; RV32I-NEXT: lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a6, a6, s11
-; RV32I-NEXT: or t2, t3, t2
-; RV32I-NEXT: or a4, t5, a4
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or t3, s6, s2
-; RV32I-NEXT: or t5, s8, s7
-; RV32I-NEXT: or a1, a1, s9
-; RV32I-NEXT: or a0, a0, s10
-; RV32I-NEXT: andi a3, a3, 24
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or t0, s4, s3
-; RV32I-NEXT: or t1, t4, s5
-; RV32I-NEXT: or a6, t2, a6
-; RV32I-NEXT: or a4, s0, a4
-; RV32I-NEXT: or t2, t5, t3
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: sub t3, t6, a3
-; RV32I-NEXT: sw a6, 56(sp)
-; RV32I-NEXT: sw a4, 60(sp)
-; RV32I-NEXT: sw t2, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: or t3, s2, t3
+; RV32I-NEXT: or s2, s4, s3
+; RV32I-NEXT: or s3, s6, s5
+; RV32I-NEXT: or a0, a0, s7
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t5, t4
+; RV32I-NEXT: or t0, s0, t6
+; RV32I-NEXT: or t1, s2, t3
+; RV32I-NEXT: or a0, a0, s3
+; RV32I-NEXT: sub s1, s1, a1
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
; RV32I-NEXT: sw a5, 40(sp)
-; RV32I-NEXT: sw a7, 44(sp)
-; RV32I-NEXT: sw t0, 48(sp)
-; RV32I-NEXT: sw t1, 52(sp)
-; RV32I-NEXT: lw a6, 16(t3)
-; RV32I-NEXT: lw a5, 20(t3)
-; RV32I-NEXT: lw a7, 24(t3)
-; RV32I-NEXT: lw a1, 0(t3)
-; RV32I-NEXT: lw a0, 4(t3)
-; RV32I-NEXT: lw a4, 8(t3)
-; RV32I-NEXT: lw a3, 12(t3)
-; RV32I-NEXT: lw t0, 28(t3)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: lw a6, 16(s1)
+; RV32I-NEXT: lw a5, 20(s1)
+; RV32I-NEXT: lw a7, 24(s1)
+; RV32I-NEXT: lw a1, 0(s1)
+; RV32I-NEXT: lw a0, 4(s1)
+; RV32I-NEXT: lw a4, 8(s1)
+; RV32I-NEXT: lw a3, 12(s1)
+; RV32I-NEXT: lw t0, 28(s1)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -4546,21 +4519,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -4572,36 +4545,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -4846,140 +4818,137 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t6, 0(a0)
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t3, 5(a0)
-; RV32I-NEXT: lbu t4, 6(a0)
-; RV32I-NEXT: lbu t5, 7(a0)
-; RV32I-NEXT: lbu t2, 8(a0)
-; RV32I-NEXT: lbu s1, 9(a0)
-; RV32I-NEXT: lbu s7, 10(a0)
-; RV32I-NEXT: lbu s8, 11(a0)
-; RV32I-NEXT: lbu s9, 12(a0)
-; RV32I-NEXT: lbu s10, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s6, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu ra, 18(a0)
-; RV32I-NEXT: lbu a3, 19(a0)
-; RV32I-NEXT: lbu s2, 20(a0)
-; RV32I-NEXT: lbu s3, 21(a0)
-; RV32I-NEXT: lbu a7, 22(a0)
-; RV32I-NEXT: lbu t0, 23(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: slli t3, t3, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
-; RV32I-NEXT: or a4, a4, t6
-; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t3, t1
-; RV32I-NEXT: or a6, t5, t4
-; RV32I-NEXT: lbu t1, 24(a0)
-; RV32I-NEXT: lbu t5, 25(a0)
-; RV32I-NEXT: lbu t6, 26(a0)
-; RV32I-NEXT: lbu s0, 27(a0)
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: slli s7, s7, 16
-; RV32I-NEXT: slli s8, s8, 24
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: or t2, s1, t2
-; RV32I-NEXT: or t3, s8, s7
-; RV32I-NEXT: or t4, s10, s9
-; RV32I-NEXT: lbu s1, 28(a0)
-; RV32I-NEXT: lbu s7, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s6, s4
-; RV32I-NEXT: or s4, s11, s5
-; RV32I-NEXT: or s5, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s6, 1(a1)
-; RV32I-NEXT: lbu s10, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: or s2, s3, s2
-; RV32I-NEXT: addi s3, sp, 8
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: slli t5, t5, 8
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: or t0, t5, t1
+; RV32I-NEXT: or s5, a0, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, s6, t5
; RV32I-NEXT: or t1, s0, t6
-; RV32I-NEXT: or t5, s7, s1
-; RV32I-NEXT: or t6, s9, s8
-; RV32I-NEXT: or a3, s6, a3
-; RV32I-NEXT: or a1, a1, s10
-; RV32I-NEXT: srai s0, s9, 31
-; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t3, t2
-; RV32I-NEXT: or a0, a0, t4
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or a7, a7, s2
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: or t1, t6, t5
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: sw s0, 56(sp)
-; RV32I-NEXT: sw s0, 60(sp)
-; RV32I-NEXT: sw s0, 64(sp)
-; RV32I-NEXT: sw s0, 68(sp)
-; RV32I-NEXT: sw s0, 40(sp)
-; RV32I-NEXT: sw s0, 44(sp)
-; RV32I-NEXT: sw s0, 48(sp)
-; RV32I-NEXT: sw s0, 52(sp)
-; RV32I-NEXT: sw t2, 24(sp)
-; RV32I-NEXT: sw a7, 28(sp)
-; RV32I-NEXT: sw t0, 32(sp)
-; RV32I-NEXT: sw t1, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a1, a1, s1
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: slli t1, a1, 3
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: add a1, s3, a1
+; RV32I-NEXT: add a1, s4, a1
; RV32I-NEXT: andi a0, t1, 24
-; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: xori a7, a0, 31
; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a6, 12(a1)
-; RV32I-NEXT: lw a7, 16(a1)
+; RV32I-NEXT: lw t0, 16(a1)
; RV32I-NEXT: lw t2, 20(a1)
; RV32I-NEXT: lw t3, 24(a1)
; RV32I-NEXT: lw t4, 28(a1)
@@ -4988,33 +4957,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srl a1, a3, t1
; RV32I-NEXT: slli t6, a4, 1
; RV32I-NEXT: srl a3, a6, t1
-; RV32I-NEXT: slli s0, a7, 1
+; RV32I-NEXT: slli s0, t0, 1
; RV32I-NEXT: srl a4, a5, t1
; RV32I-NEXT: slli s1, a6, 1
; RV32I-NEXT: srl a5, t2, t1
; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: srl a6, a7, t1
+; RV32I-NEXT: srl a6, t0, t1
; RV32I-NEXT: slli t2, t2, 1
-; RV32I-NEXT: srl a7, t3, t1
+; RV32I-NEXT: srl t0, t3, t1
; RV32I-NEXT: slli t3, t4, 1
; RV32I-NEXT: sra t1, t4, t1
-; RV32I-NEXT: sll t4, t5, t0
-; RV32I-NEXT: sll t5, t6, t0
-; RV32I-NEXT: sll t6, s0, t0
-; RV32I-NEXT: sll s0, s1, t0
-; RV32I-NEXT: sll s1, s2, t0
-; RV32I-NEXT: sll t2, t2, t0
-; RV32I-NEXT: sll t3, t3, t0
+; RV32I-NEXT: sll t4, t5, a7
+; RV32I-NEXT: sll t5, t6, a7
+; RV32I-NEXT: sll t6, s0, a7
+; RV32I-NEXT: sll s0, s1, a7
+; RV32I-NEXT: sll s1, s2, a7
+; RV32I-NEXT: sll t2, t2, a7
+; RV32I-NEXT: sll t3, t3, a7
; RV32I-NEXT: srli s2, t1, 24
; RV32I-NEXT: srli s3, t1, 16
; RV32I-NEXT: srli s4, t1, 8
-; RV32I-NEXT: or t0, a0, t4
+; RV32I-NEXT: or a7, a0, t4
; RV32I-NEXT: or t4, a1, t5
; RV32I-NEXT: or t5, a3, t6
; RV32I-NEXT: or s0, a4, s0
; RV32I-NEXT: or s1, a5, s1
; RV32I-NEXT: or t2, a6, t2
-; RV32I-NEXT: or t3, a7, t3
+; RV32I-NEXT: or t3, t0, t3
; RV32I-NEXT: sb t1, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
; RV32I-NEXT: sb s3, 30(a2)
@@ -5031,23 +5000,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s6, s0, 24
; RV32I-NEXT: srli s7, s0, 16
; RV32I-NEXT: srli s0, s0, 8
-; RV32I-NEXT: srli s8, t5, 24
-; RV32I-NEXT: srli s9, t5, 16
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: srli s10, t4, 24
-; RV32I-NEXT: srli s11, t4, 16
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: sb t0, 24(a2)
+; RV32I-NEXT: srli t0, t5, 24
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, t5, 16
+; RV32I-NEXT: srli t5, t5, 8
; RV32I-NEXT: sb t6, 26(a2)
+; RV32I-NEXT: srli t6, t4, 24
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli a7, t0, 24
+; RV32I-NEXT: srli t1, t4, 16
+; RV32I-NEXT: srli t4, t4, 8
; RV32I-NEXT: sb a6, 16(a2)
+; RV32I-NEXT: srli a6, a7, 24
; RV32I-NEXT: sb t2, 17(a2)
; RV32I-NEXT: sb s3, 18(a2)
; RV32I-NEXT: sb s2, 19(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: srli t2, a7, 16
+; RV32I-NEXT: srli a7, a7, 8
; RV32I-NEXT: sb a5, 20(a2)
; RV32I-NEXT: sb s1, 21(a2)
; RV32I-NEXT: sb s5, 22(a2)
@@ -5058,30 +5027,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: sb a3, 12(a2)
; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb s9, 14(a2)
-; RV32I-NEXT: sb s8, 15(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb t0, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t4, 1(a2)
-; RV32I-NEXT: sb s11, 2(a2)
-; RV32I-NEXT: sb s10, 3(a2)
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: sb t6, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: sb t0, 5(a2)
-; RV32I-NEXT: sb a6, 6(a2)
-; RV32I-NEXT: sb a7, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb t2, 6(a2)
+; RV32I-NEXT: sb a6, 7(a2)
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -5327,130 +5295,129 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
;
; RV32I-LABEL: ashr_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t1, 3(a0)
-; RV32I-NEXT: lbu s0, 4(a0)
-; RV32I-NEXT: lbu s2, 5(a0)
-; RV32I-NEXT: lbu s3, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu s7, 9(a0)
-; RV32I-NEXT: lbu s8, 10(a0)
-; RV32I-NEXT: lbu s9, 11(a0)
-; RV32I-NEXT: lbu s10, 12(a0)
-; RV32I-NEXT: lbu s11, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t0, 17(a0)
-; RV32I-NEXT: lbu t2, 18(a0)
-; RV32I-NEXT: lbu t3, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t4, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu t6, 23(a0)
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t1, a7
-; RV32I-NEXT: or a7, s2, s0
-; RV32I-NEXT: or t1, s6, s3
-; RV32I-NEXT: lbu s0, 24(a0)
-; RV32I-NEXT: lbu s6, 25(a0)
-; RV32I-NEXT: lbu ra, 26(a0)
-; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or s1, s7, s1
-; RV32I-NEXT: or s7, s9, s8
-; RV32I-NEXT: or s3, s11, s10
-; RV32I-NEXT: lbu s8, 28(a0)
-; RV32I-NEXT: lbu s9, 29(a0)
-; RV32I-NEXT: lbu s10, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or s4, s5, s4
-; RV32I-NEXT: addi s5, sp, 8
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t5, 25(a0)
+; RV32I-NEXT: lbu t6, 26(a0)
+; RV32I-NEXT: lbu s0, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t4, s5, s4
+; RV32I-NEXT: or s1, s7, s6
+; RV32I-NEXT: or s2, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: lbu a1, 0(a1)
; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: mv s7, sp
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: slli s0, s0, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 2
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: or t0, t3, t2
-; RV32I-NEXT: or a4, t4, a4
-; RV32I-NEXT: or t2, t6, t5
-; RV32I-NEXT: or t3, s6, s0
-; RV32I-NEXT: or t4, s2, ra
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or t6, a0, s10
+; RV32I-NEXT: or t3, t5, t3
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, s4, s3
+; RV32I-NEXT: or s0, a0, s5
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: andi a1, a1, 28
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t1, a7
-; RV32I-NEXT: or a7, s7, s1
-; RV32I-NEXT: or t1, s4, s3
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: or a4, t2, a4
-; RV32I-NEXT: or t0, t4, t3
-; RV32I-NEXT: or t2, t6, t5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, s1, t4
+; RV32I-NEXT: or t0, s6, s2
+; RV32I-NEXT: or t1, t5, t3
+; RV32I-NEXT: or t2, s0, t6
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: sw a0, 48(sp)
-; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: add s5, s5, a1
-; RV32I-NEXT: sw a3, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw t0, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
+; RV32I-NEXT: add s7, s7, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
; RV32I-NEXT: sw a5, 8(sp)
; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t1, 20(sp)
-; RV32I-NEXT: lw a6, 16(s5)
-; RV32I-NEXT: lw a5, 20(s5)
-; RV32I-NEXT: lw a7, 24(s5)
-; RV32I-NEXT: lw a1, 0(s5)
-; RV32I-NEXT: lw a0, 4(s5)
-; RV32I-NEXT: lw a4, 8(s5)
-; RV32I-NEXT: lw a3, 12(s5)
-; RV32I-NEXT: lw t0, 28(s5)
+; RV32I-NEXT: lw a6, 16(s7)
+; RV32I-NEXT: lw a5, 20(s7)
+; RV32I-NEXT: lw a7, 24(s7)
+; RV32I-NEXT: lw a1, 0(s7)
+; RV32I-NEXT: lw a0, 4(s7)
+; RV32I-NEXT: lw a4, 8(s7)
+; RV32I-NEXT: lw a3, 12(s7)
+; RV32I-NEXT: lw t0, 28(s7)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -5465,21 +5432,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -5491,36 +5458,35 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -5546,112 +5512,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a7, 1(a0)
-; RV64I-NEXT: lbu t1, 2(a0)
-; RV64I-NEXT: lbu s3, 3(a0)
-; RV64I-NEXT: lbu t0, 4(a0)
-; RV64I-NEXT: lbu s8, 5(a0)
-; RV64I-NEXT: lbu s9, 6(a0)
-; RV64I-NEXT: lbu s10, 7(a0)
-; RV64I-NEXT: lbu s2, 8(a0)
-; RV64I-NEXT: lbu s4, 9(a0)
-; RV64I-NEXT: lbu s5, 10(a0)
-; RV64I-NEXT: lbu s6, 11(a0)
-; RV64I-NEXT: lbu s7, 12(a0)
-; RV64I-NEXT: lbu s11, 13(a0)
-; RV64I-NEXT: lbu t4, 14(a0)
-; RV64I-NEXT: lbu t5, 15(a0)
-; RV64I-NEXT: lbu a3, 16(a0)
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu t2, 18(a0)
-; RV64I-NEXT: lbu t3, 19(a0)
-; RV64I-NEXT: lbu a4, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s0, 22(a0)
-; RV64I-NEXT: lbu s1, 23(a0)
-; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: lbu s0, 12(a0)
+; RV64I-NEXT: lbu s1, 13(a0)
+; RV64I-NEXT: lbu s2, 14(a0)
+; RV64I-NEXT: lbu s3, 15(a0)
+; RV64I-NEXT: lbu s4, 16(a0)
+; RV64I-NEXT: lbu s5, 17(a0)
+; RV64I-NEXT: lbu s6, 18(a0)
+; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, s3, t1
-; RV64I-NEXT: or t0, s8, t0
-; RV64I-NEXT: or t1, s10, s9
-; RV64I-NEXT: lbu s3, 24(a0)
-; RV64I-NEXT: lbu s8, 25(a0)
-; RV64I-NEXT: lbu s9, 26(a0)
-; RV64I-NEXT: lbu s10, 27(a0)
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: slli s11, s11, 8
-; RV64I-NEXT: or s2, s4, s2
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: or s5, s11, s7
-; RV64I-NEXT: lbu s6, 28(a0)
-; RV64I-NEXT: lbu s7, 29(a0)
-; RV64I-NEXT: lbu s11, 30(a0)
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t3, 24(a0)
+; RV64I-NEXT: lbu t4, 25(a0)
+; RV64I-NEXT: lbu t5, 26(a0)
+; RV64I-NEXT: lbu t6, 27(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: slli s6, s6, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: slli s9, s9, 8
+; RV64I-NEXT: or s0, s5, s4
+; RV64I-NEXT: or s1, s7, s6
+; RV64I-NEXT: or s2, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
+; RV64I-NEXT: lbu s4, 29(a0)
+; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: slli t4, t4, 16
-; RV64I-NEXT: slli t5, t5, 24
-; RV64I-NEXT: or t4, t5, t4
-; RV64I-NEXT: mv t5, sp
-; RV64I-NEXT: slli a6, a6, 8
-; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli t3, t3, 24
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s0, s0, 16
-; RV64I-NEXT: slli s1, s1, 24
-; RV64I-NEXT: slli s8, s8, 8
-; RV64I-NEXT: slli s9, s9, 16
-; RV64I-NEXT: slli s10, s10, 24
-; RV64I-NEXT: slli s7, s7, 8
-; RV64I-NEXT: slli s11, s11, 16
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: or s6, s11, s10
+; RV64I-NEXT: mv s7, sp
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli t5, t5, 16
+; RV64I-NEXT: slli t6, t6, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: slli a1, a1, 3
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a6, t3, t2
-; RV64I-NEXT: or a4, t6, a4
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: or t2, s8, s3
-; RV64I-NEXT: or t3, s10, s9
-; RV64I-NEXT: or t6, s7, s6
-; RV64I-NEXT: or a0, a0, s11
+; RV64I-NEXT: or t3, t4, t3
+; RV64I-NEXT: or t4, t6, t5
+; RV64I-NEXT: or t5, s4, s3
+; RV64I-NEXT: or a0, a0, s5
; RV64I-NEXT: andi a1, a1, 24
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t1, t0
-; RV64I-NEXT: or t0, s4, s2
-; RV64I-NEXT: or t1, t4, s5
-; RV64I-NEXT: or a3, a6, a3
-; RV64I-NEXT: or a4, s0, a4
-; RV64I-NEXT: or a6, t3, t2
-; RV64I-NEXT: or a0, a0, t6
-; RV64I-NEXT: add t5, t5, a1
-; RV64I-NEXT: slli a7, a7, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: or a7, s6, s2
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: add s7, s7, a1
; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: slli a7, a7, 32
; RV64I-NEXT: slli a1, a0, 32
; RV64I-NEXT: sraiw a0, a0, 31
-; RV64I-NEXT: or a5, a7, a5
-; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, a7, s0
+; RV64I-NEXT: or a1, a1, t0
; RV64I-NEXT: sd a0, 32(sp)
; RV64I-NEXT: sd a0, 40(sp)
; RV64I-NEXT: sd a0, 48(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: sd a5, 0(sp)
-; RV64I-NEXT: sd a7, 8(sp)
-; RV64I-NEXT: sd a3, 16(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a5, 16(sp)
; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a4, 16(t5)
-; RV64I-NEXT: ld a0, 8(t5)
-; RV64I-NEXT: ld a1, 0(t5)
-; RV64I-NEXT: ld a3, 24(t5)
+; RV64I-NEXT: ld a4, 16(s7)
+; RV64I-NEXT: ld a0, 8(s7)
+; RV64I-NEXT: ld a1, 0(s7)
+; RV64I-NEXT: ld a3, 24(s7)
; RV64I-NEXT: srli a5, a4, 56
; RV64I-NEXT: srli a6, a4, 48
; RV64I-NEXT: srli a7, a4, 40
@@ -5670,25 +5636,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: srli s5, a1, 48
; RV64I-NEXT: srli s6, a1, 40
; RV64I-NEXT: srli s7, a1, 32
-; RV64I-NEXT: srli s8, a1, 24
-; RV64I-NEXT: srli s9, a1, 16
-; RV64I-NEXT: srli s10, a1, 8
-; RV64I-NEXT: srli s11, a0, 56
; RV64I-NEXT: sb t0, 20(a2)
+; RV64I-NEXT: srli t0, a1, 24
; RV64I-NEXT: sb a7, 21(a2)
+; RV64I-NEXT: srli a7, a1, 16
; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, a1, 8
; RV64I-NEXT: sb a5, 23(a2)
-; RV64I-NEXT: srli a5, a0, 48
+; RV64I-NEXT: srli a5, a0, 56
; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: srli a4, a0, 48
; RV64I-NEXT: sb t3, 17(a2)
; RV64I-NEXT: sb t2, 18(a2)
; RV64I-NEXT: sb t1, 19(a2)
-; RV64I-NEXT: srli a4, a0, 40
+; RV64I-NEXT: srli t1, a0, 40
; RV64I-NEXT: sb s0, 28(a2)
; RV64I-NEXT: sb t6, 29(a2)
; RV64I-NEXT: sb t5, 30(a2)
; RV64I-NEXT: sb t4, 31(a2)
-; RV64I-NEXT: srli a6, a0, 32
+; RV64I-NEXT: srli t2, a0, 32
; RV64I-NEXT: sb a3, 24(a2)
; RV64I-NEXT: sb s3, 25(a2)
; RV64I-NEXT: sb s2, 26(a2)
@@ -5698,19 +5664,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: sb s6, 5(a2)
; RV64I-NEXT: sb s5, 6(a2)
; RV64I-NEXT: sb s4, 7(a2)
-; RV64I-NEXT: srli a7, a0, 16
+; RV64I-NEXT: srli t3, a0, 16
; RV64I-NEXT: sb a1, 0(a2)
-; RV64I-NEXT: sb s10, 1(a2)
-; RV64I-NEXT: sb s9, 2(a2)
-; RV64I-NEXT: sb s8, 3(a2)
+; RV64I-NEXT: sb a6, 1(a2)
+; RV64I-NEXT: sb a7, 2(a2)
+; RV64I-NEXT: sb t0, 3(a2)
; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a6, 12(a2)
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: sb a5, 14(a2)
-; RV64I-NEXT: sb s11, 15(a2)
+; RV64I-NEXT: sb t2, 12(a2)
+; RV64I-NEXT: sb t1, 13(a2)
+; RV64I-NEXT: sb a4, 14(a2)
+; RV64I-NEXT: sb a5, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a7, 10(a2)
+; RV64I-NEXT: sb t3, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
@@ -5729,130 +5695,129 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
;
; RV32I-LABEL: ashr_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t1, 3(a0)
-; RV32I-NEXT: lbu s0, 4(a0)
-; RV32I-NEXT: lbu s2, 5(a0)
-; RV32I-NEXT: lbu s3, 6(a0)
-; RV32I-NEXT: lbu s6, 7(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu s7, 9(a0)
-; RV32I-NEXT: lbu s8, 10(a0)
-; RV32I-NEXT: lbu s9, 11(a0)
-; RV32I-NEXT: lbu s10, 12(a0)
-; RV32I-NEXT: lbu s11, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu a3, 16(a0)
-; RV32I-NEXT: lbu t0, 17(a0)
-; RV32I-NEXT: lbu t2, 18(a0)
-; RV32I-NEXT: lbu t3, 19(a0)
-; RV32I-NEXT: lbu a4, 20(a0)
-; RV32I-NEXT: lbu t4, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu t6, 23(a0)
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s3, s3, 16
-; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t1, a7
-; RV32I-NEXT: or a7, s2, s0
-; RV32I-NEXT: or t1, s6, s3
-; RV32I-NEXT: lbu s0, 24(a0)
-; RV32I-NEXT: lbu s6, 25(a0)
-; RV32I-NEXT: lbu ra, 26(a0)
-; RV32I-NEXT: lbu s2, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or s1, s7, s1
-; RV32I-NEXT: or s7, s9, s8
-; RV32I-NEXT: or s3, s11, s10
-; RV32I-NEXT: lbu s8, 28(a0)
-; RV32I-NEXT: lbu s9, 29(a0)
-; RV32I-NEXT: lbu s10, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or s4, s5, s4
-; RV32I-NEXT: addi s5, sp, 8
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 0(a0)
+; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
+; RV32I-NEXT: lbu t0, 5(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s6, s6, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t3, 24(a0)
+; RV32I-NEXT: lbu t5, 25(a0)
+; RV32I-NEXT: lbu t6, 26(a0)
+; RV32I-NEXT: lbu s0, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t4, s5, s4
+; RV32I-NEXT: or s1, s7, s6
+; RV32I-NEXT: or s2, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: lbu a1, 0(a1)
; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: mv s7, sp
+; RV32I-NEXT: slli t5, t5, 8
+; RV32I-NEXT: slli t6, t6, 16
+; RV32I-NEXT: slli s0, s0, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: slli a1, a1, 3
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: or t0, t3, t2
-; RV32I-NEXT: or a4, t4, a4
-; RV32I-NEXT: or t2, t6, t5
-; RV32I-NEXT: or t3, s6, s0
-; RV32I-NEXT: or t4, s2, ra
-; RV32I-NEXT: or t5, s9, s8
-; RV32I-NEXT: or t6, a0, s10
+; RV32I-NEXT: or t3, t5, t3
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or t6, s4, s3
+; RV32I-NEXT: or s0, a0, s5
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: andi a1, a1, 24
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t1, a7
-; RV32I-NEXT: or a7, s7, s1
-; RV32I-NEXT: or t1, s4, s3
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: or a4, t2, a4
-; RV32I-NEXT: or t0, t4, t3
-; RV32I-NEXT: or t2, t6, t5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, s1, t4
+; RV32I-NEXT: or t0, s6, s2
+; RV32I-NEXT: or t1, t5, t3
+; RV32I-NEXT: or t2, s0, t6
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 52(sp)
; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 64(sp)
-; RV32I-NEXT: sw a0, 68(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a0, 36(sp)
; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: sw a0, 48(sp)
-; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: add s5, s5, a1
-; RV32I-NEXT: sw a3, 24(sp)
-; RV32I-NEXT: sw a4, 28(sp)
-; RV32I-NEXT: sw t0, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
+; RV32I-NEXT: add s7, s7, a1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
; RV32I-NEXT: sw a5, 8(sp)
; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: sw a7, 16(sp)
-; RV32I-NEXT: sw t1, 20(sp)
-; RV32I-NEXT: lw a6, 16(s5)
-; RV32I-NEXT: lw a5, 20(s5)
-; RV32I-NEXT: lw a7, 24(s5)
-; RV32I-NEXT: lw a1, 0(s5)
-; RV32I-NEXT: lw a0, 4(s5)
-; RV32I-NEXT: lw a4, 8(s5)
-; RV32I-NEXT: lw a3, 12(s5)
-; RV32I-NEXT: lw t0, 28(s5)
+; RV32I-NEXT: lw a6, 16(s7)
+; RV32I-NEXT: lw a5, 20(s7)
+; RV32I-NEXT: lw a7, 24(s7)
+; RV32I-NEXT: lw a1, 0(s7)
+; RV32I-NEXT: lw a0, 4(s7)
+; RV32I-NEXT: lw a4, 8(s7)
+; RV32I-NEXT: lw a3, 12(s7)
+; RV32I-NEXT: lw t0, 28(s7)
; RV32I-NEXT: srli t1, a7, 24
; RV32I-NEXT: srli t2, a7, 16
; RV32I-NEXT: srli t3, a7, 8
@@ -5867,21 +5832,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: srli s5, a5, 8
; RV32I-NEXT: srli s6, a4, 24
; RV32I-NEXT: srli s7, a4, 16
-; RV32I-NEXT: srli s8, a4, 8
-; RV32I-NEXT: srli s9, a3, 24
-; RV32I-NEXT: srli s10, a3, 16
-; RV32I-NEXT: srli s11, a3, 8
; RV32I-NEXT: sb a7, 24(a2)
-; RV32I-NEXT: srli a7, a1, 24
+; RV32I-NEXT: srli a7, a4, 8
; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
; RV32I-NEXT: sb t2, 26(a2)
+; RV32I-NEXT: srli t2, a3, 16
; RV32I-NEXT: sb t1, 27(a2)
-; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: srli t1, a3, 8
; RV32I-NEXT: sb t0, 28(a2)
+; RV32I-NEXT: srli t0, a1, 24
; RV32I-NEXT: sb t6, 29(a2)
+; RV32I-NEXT: srli t6, a1, 16
; RV32I-NEXT: sb t5, 30(a2)
; RV32I-NEXT: sb t4, 31(a2)
-; RV32I-NEXT: srli t0, a1, 8
+; RV32I-NEXT: srli t4, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb s2, 17(a2)
; RV32I-NEXT: sb s1, 18(a2)
@@ -5893,36 +5858,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV32I-NEXT: sb s3, 23(a2)
; RV32I-NEXT: srli a5, a0, 16
; RV32I-NEXT: sb a4, 8(a2)
-; RV32I-NEXT: sb s8, 9(a2)
+; RV32I-NEXT: sb a7, 9(a2)
; RV32I-NEXT: sb s7, 10(a2)
; RV32I-NEXT: sb s6, 11(a2)
; RV32I-NEXT: srli a4, a0, 8
; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb s11, 13(a2)
-; RV32I-NEXT: sb s10, 14(a2)
-; RV32I-NEXT: sb s9, 15(a2)
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb t3, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb t0, 1(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: sb t4, 1(a2)
+; RV32I-NEXT: sb t6, 2(a2)
+; RV32I-NEXT: sb t0, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index b2c130c2d7c10a..b8952d2cb2b29e 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -1530,25 +1530,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a6, 2(a0)
-; RV32I-NEXT: lbu a7, 3(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -1557,107 +1556,105 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t5, 10(a0)
; RV32I-NEXT: lbu t6, 11(a0)
; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s2, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: lbu a3, 23(a0)
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
; RV32I-NEXT: or a7, t4, t3
; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s3, 25(a0)
-; RV32I-NEXT: lbu t4, 26(a0)
-; RV32I-NEXT: lbu t5, 27(a0)
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or t3, s7, s6
-; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
; RV32I-NEXT: lbu s4, 29(a0)
; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or s2, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 52(sp)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 64(sp)
-; RV32I-NEXT: sw zero, 68(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 36(sp)
; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 48(sp)
-; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: or s1, s3, s1
-; RV32I-NEXT: addi s3, sp, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or t4, t5, t4
-; RV32I-NEXT: or t5, s4, t6
-; RV32I-NEXT: or t6, s6, s5
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s8
-; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s4
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a7, t2, t1
-; RV32I-NEXT: or t0, a0, t3
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, t4, s1
-; RV32I-NEXT: or t3, t6, t5
-; RV32I-NEXT: or a0, a1, a3
-; RV32I-NEXT: sw t0, 24(sp)
-; RV32I-NEXT: sw t1, 28(sp)
-; RV32I-NEXT: sw t2, 32(sp)
-; RV32I-NEXT: sw t3, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, a0, t5
+; RV32I-NEXT: or t1, s0, t6
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a0, a1, s1
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: add a4, s3, a4
+; RV32I-NEXT: add a4, s4, a4
; RV32I-NEXT: lw a3, 0(a4)
; RV32I-NEXT: lw a5, 4(a4)
; RV32I-NEXT: lw a6, 8(a4)
@@ -1717,13 +1714,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
-; RV32I-NEXT: srli s8, a1, 24
-; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
+; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli a7, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb t5, 17(a2)
; RV32I-NEXT: sb t4, 18(a2)
@@ -1744,27 +1741,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 14(a2)
; RV32I-NEXT: sb s5, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: sb s9, 2(a2)
-; RV32I-NEXT: sb s8, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t2, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2006,25 +2002,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a6, 2(a0)
-; RV32I-NEXT: lbu a7, 3(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -2033,107 +2028,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t5, 10(a0)
; RV32I-NEXT: lbu t6, 11(a0)
; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s2, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu s9, 19(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: lbu a3, 23(a0)
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: slli s1, s1, 8
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
; RV32I-NEXT: or a7, t4, t3
; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: lbu s1, 24(a0)
-; RV32I-NEXT: lbu s3, 25(a0)
-; RV32I-NEXT: lbu t4, 26(a0)
-; RV32I-NEXT: lbu t5, 27(a0)
-; RV32I-NEXT: slli s2, s2, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: or t3, s7, s6
-; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
; RV32I-NEXT: lbu s4, 29(a0)
; RV32I-NEXT: lbu s5, 30(a0)
; RV32I-NEXT: lbu s6, 31(a0)
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a0, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: or s2, a3, ra
-; RV32I-NEXT: lbu a3, 0(a1)
-; RV32I-NEXT: lbu s7, 1(a1)
-; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or a0, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 32(sp)
-; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: slli s3, s3, 8
-; RV32I-NEXT: or s1, s3, s1
-; RV32I-NEXT: addi s3, sp, 40
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: addi s4, sp, 32
; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli s6, s6, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s2, s2, 8
+; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or t4, t5, t4
-; RV32I-NEXT: or t5, s4, t6
-; RV32I-NEXT: or t6, s6, s5
-; RV32I-NEXT: or a3, s7, a3
-; RV32I-NEXT: or a1, a1, s8
-; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, s4
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a7, t2, t1
-; RV32I-NEXT: or t0, a0, t3
-; RV32I-NEXT: or t1, s2, s0
-; RV32I-NEXT: or t2, t4, s1
-; RV32I-NEXT: or t3, t6, t5
-; RV32I-NEXT: or a0, a1, a3
-; RV32I-NEXT: sw t0, 56(sp)
-; RV32I-NEXT: sw t1, 60(sp)
-; RV32I-NEXT: sw t2, 64(sp)
-; RV32I-NEXT: sw t3, 68(sp)
-; RV32I-NEXT: sw a4, 40(sp)
-; RV32I-NEXT: sw a5, 44(sp)
-; RV32I-NEXT: sw a6, 48(sp)
-; RV32I-NEXT: sw a7, 52(sp)
+; RV32I-NEXT: or s5, s6, s5
+; RV32I-NEXT: or s1, s2, s1
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, a0, t5
+; RV32I-NEXT: or t1, s0, t6
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a0, a1, s1
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t2, 60(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a6, 44(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: sub a3, s3, a4
+; RV32I-NEXT: sub a3, s4, a4
; RV32I-NEXT: lw a4, 0(a3)
; RV32I-NEXT: lw a5, 4(a3)
; RV32I-NEXT: lw a6, 8(a3)
@@ -2193,13 +2186,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
-; RV32I-NEXT: srli s8, a1, 24
-; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
+; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli a7, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 28(a2)
; RV32I-NEXT: sb t5, 29(a2)
; RV32I-NEXT: sb t4, 30(a2)
@@ -2220,27 +2213,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 10(a2)
; RV32I-NEXT: sb s5, 11(a2)
; RV32I-NEXT: sb a1, 12(a2)
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: sb s9, 14(a2)
-; RV32I-NEXT: sb s8, 15(a2)
+; RV32I-NEXT: sb t0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb a7, 15(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2483,25 +2475,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -128
-; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -112
+; RV32I-NEXT: sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s11, 64(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 0(a0)
; RV32I-NEXT: lbu a4, 1(a0)
-; RV32I-NEXT: lbu a6, 2(a0)
-; RV32I-NEXT: lbu a7, 3(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: lbu a7, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
@@ -2518,100 +2509,98 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu s6, 18(a0)
; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: or a4, a7, a6
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
; RV32I-NEXT: lbu s8, 20(a0)
; RV32I-NEXT: lbu s9, 21(a0)
; RV32I-NEXT: lbu s10, 22(a0)
; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: slli t4, t4, 8
; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t2, t1
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t0, t6, t5
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu a3, 25(a0)
-; RV32I-NEXT: lbu t4, 26(a0)
-; RV32I-NEXT: lbu t5, 27(a0)
; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: slli s2, s2, 16
; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
; RV32I-NEXT: or t1, s1, s0
; RV32I-NEXT: or t2, s3, s2
-; RV32I-NEXT: or t3, s5, s4
-; RV32I-NEXT: lbu t6, 28(a0)
-; RV32I-NEXT: lbu s0, 29(a0)
-; RV32I-NEXT: lbu s1, 30(a0)
-; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: lbu t6, 24(a0)
+; RV32I-NEXT: lbu s0, 25(a0)
+; RV32I-NEXT: lbu s1, 26(a0)
+; RV32I-NEXT: lbu s2, 27(a0)
+; RV32I-NEXT: slli s5, s5, 8
; RV32I-NEXT: slli s6, s6, 16
; RV32I-NEXT: slli s7, s7, 24
; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: or t4, s7, s6
+; RV32I-NEXT: or t5, s9, s8
+; RV32I-NEXT: lbu s3, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
; RV32I-NEXT: slli s10, s10, 16
; RV32I-NEXT: slli s11, s11, 24
-; RV32I-NEXT: or s2, s7, s6
-; RV32I-NEXT: or s3, s9, s8
-; RV32I-NEXT: or s4, s11, s10
-; RV32I-NEXT: lbu s5, 0(a1)
-; RV32I-NEXT: lbu s6, 1(a1)
-; RV32I-NEXT: lbu s7, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, ra
-; RV32I-NEXT: addi s8, sp, 8
-; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t5, t5, 24
; RV32I-NEXT: slli s0, s0, 8
; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli s2, s2, 24
+; RV32I-NEXT: or s6, s11, s10
+; RV32I-NEXT: or t6, s0, t6
+; RV32I-NEXT: or s0, s2, s1
+; RV32I-NEXT: lbu s1, 0(a1)
+; RV32I-NEXT: lbu s2, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: or s3, s4, s3
+; RV32I-NEXT: mv s4, sp
+; RV32I-NEXT: slli s5, s5, 16
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s7, s7, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or t4, t5, t4
-; RV32I-NEXT: or t5, s0, t6
-; RV32I-NEXT: or s1, a0, s1
-; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or s5, a0, s5
+; RV32I-NEXT: or s1, s2, s1
; RV32I-NEXT: or a1, a1, s7
-; RV32I-NEXT: srai s0, a0, 31
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: or a4, a4, a0
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a7, t2, t1
-; RV32I-NEXT: or t0, s2, t3
-; RV32I-NEXT: or t1, s4, s3
-; RV32I-NEXT: or a3, t4, a3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: or a0, a1, t6
-; RV32I-NEXT: sw s0, 56(sp)
-; RV32I-NEXT: sw s0, 60(sp)
-; RV32I-NEXT: sw s0, 64(sp)
-; RV32I-NEXT: sw s0, 68(sp)
-; RV32I-NEXT: sw s0, 40(sp)
-; RV32I-NEXT: sw s0, 44(sp)
-; RV32I-NEXT: sw s0, 48(sp)
-; RV32I-NEXT: sw s0, 52(sp)
-; RV32I-NEXT: sw t0, 24(sp)
-; RV32I-NEXT: sw t1, 28(sp)
-; RV32I-NEXT: sw a3, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 12(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: srai s2, a0, 31
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, s6, t5
+; RV32I-NEXT: or t1, s0, t6
+; RV32I-NEXT: or t2, s5, s3
+; RV32I-NEXT: or a0, a1, s1
+; RV32I-NEXT: sw s2, 48(sp)
+; RV32I-NEXT: sw s2, 52(sp)
+; RV32I-NEXT: sw s2, 56(sp)
+; RV32I-NEXT: sw s2, 60(sp)
+; RV32I-NEXT: sw s2, 32(sp)
+; RV32I-NEXT: sw s2, 36(sp)
+; RV32I-NEXT: sw s2, 40(sp)
+; RV32I-NEXT: sw s2, 44(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a6, 12(sp)
; RV32I-NEXT: srli a1, a0, 3
; RV32I-NEXT: andi a3, a0, 31
; RV32I-NEXT: andi a4, a1, 28
; RV32I-NEXT: xori a1, a3, 31
-; RV32I-NEXT: add a4, s8, a4
+; RV32I-NEXT: add a4, s4, a4
; RV32I-NEXT: lw a3, 0(a4)
; RV32I-NEXT: lw a5, 4(a4)
; RV32I-NEXT: lw a6, 8(a4)
@@ -2671,13 +2660,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli s5, a3, 24
; RV32I-NEXT: srli s6, a3, 16
; RV32I-NEXT: srli s7, a3, 8
-; RV32I-NEXT: srli s8, a1, 24
-; RV32I-NEXT: srli s9, a1, 16
; RV32I-NEXT: sb a7, 24(a2)
+; RV32I-NEXT: srli a7, a1, 24
; RV32I-NEXT: sb t2, 25(a2)
+; RV32I-NEXT: srli t2, a1, 16
; RV32I-NEXT: sb t1, 26(a2)
; RV32I-NEXT: sb t0, 27(a2)
-; RV32I-NEXT: srli a7, a1, 8
+; RV32I-NEXT: srli t0, a1, 8
; RV32I-NEXT: sb a6, 16(a2)
; RV32I-NEXT: sb t5, 17(a2)
; RV32I-NEXT: sb t4, 18(a2)
@@ -2698,27 +2687,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb s6, 14(a2)
; RV32I-NEXT: sb s5, 15(a2)
; RV32I-NEXT: sb a1, 0(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: sb s9, 2(a2)
-; RV32I-NEXT: sb s8, 3(a2)
+; RV32I-NEXT: sb t0, 1(a2)
+; RV32I-NEXT: sb t2, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
; RV32I-NEXT: sb a0, 4(a2)
; RV32I-NEXT: sb a4, 5(a2)
; RV32I-NEXT: sb a5, 6(a2)
; RV32I-NEXT: sb a6, 7(a2)
-; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 128
+; RV32I-NEXT: lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 112
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 2c4b1f36ffd23d..67759bd5c4632e 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -50,8 +50,8 @@ public:
const char *getRegPressureSetName(unsigned Idx) const override {
return "bogus";
}
- unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
- bool RemoveReserved) const override {
+ unsigned getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const override {
return 0;
}
const int *
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index 674925c1b2acd3..a6f87119aca5ba 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -275,8 +275,7 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
OS << "// Get the register unit pressure limit for this dimension.\n"
<< "// This limit must be adjusted dynamically for reserved registers.\n"
<< "unsigned " << ClassName << "::\n"
- << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, bool "
- "RemoveReserved) const "
+ << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const "
"{\n"
<< " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
<< " PressureLimitTable[] = {\n";
@@ -1131,7 +1130,7 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) {
<< " unsigned getNumRegPressureSets() const override;\n"
<< " const char *getRegPressureSetName(unsigned Idx) const override;\n"
<< " unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned "
- "Idx, bool RemoveReserved = true) const override;\n"
+ "Idx) const override;\n"
<< " const int *getRegClassPressureSets("
<< "const TargetRegisterClass *RC) const override;\n"
<< " const int *getRegUnitPressureSets("