[llvm] [RISCV] Correct the limit of RegPressureSet `GPRAll` (PR #118473)

Pengcheng Wang via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 3 05:47:49 PST 2024


https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/118473

From 17117c191b5fd5e9c047af74d39dc3a7be9d2091 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Tue, 3 Dec 2024 19:00:18 +0800
Subject: [PATCH 1/3] [RISCV] Correct the limit of RegPressureSet `GPRAll`

The generated limit is 33, which is the total number of scalar registers
plus 1 (for `DUMMY_REG_PAIR_WITH_X0`).

This is not correct, because not all scalar registers can actually be used:
4-6 of them are reserved. The limit therefore needs to be reduced by the
number of reserved registers in the set.
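
For illustration, here is the new override with explanatory comments added.
It mirrors the patch below; the note about which registers are typically
reserved is my reading of the usual RISC-V reserved set, not part of the
patch itself:

```cpp
// Sketch of the override added by this patch: clamp the GPRAll pressure
// limit to the number of allocatable scalar registers.
unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                   unsigned Idx) const {
  if (Idx == RISCV::RegisterPressureSets::GPRAll) {
    // Count how many of the 32 scalar registers are reserved for this
    // function (typically x0, sp, gp, tp, and possibly the frame pointer
    // and base pointer).
    unsigned Reserved = 0;
    BitVector ReservedRegs = getReservedRegs(MF);
    for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
      if (ReservedRegs.test(Reg))
        Reserved++;

    // 32 scalar registers minus the reserved ones.
    return 32 - Reserved;
  }
  // All other pressure sets keep the TableGen-generated limit.
  return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx);
}
```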

This change affects instruction scheduling, MachineLICM, and other passes
that consult register pressure.

Here are spill/reload statistics on `llvm-test-suite` compiled with
`-O3 -march=rva23u64`:

```
Metric: regalloc.NumSpills,regalloc.NumReloads

Program                                       regalloc.NumSpills                  regalloc.NumReloads
                                              baseline           after    diff    baseline            after    diff
External/S...NT2017rate/502.gcc_r/502.gcc_r   11812.00           11338.00 -474.00 26813.00            25751.00 -1062.00
External/S...T2017speed/602.gcc_s/602.gcc_s   11812.00           11338.00 -474.00 26813.00            25751.00 -1062.00
External/S...te/526.blender_r/526.blender_r   13514.00           13228.00 -286.00 27456.00            27260.00  -196.00
External/S...00.perlbench_s/600.perlbench_s    4398.00            4274.00 -124.00  9745.00             9341.00  -404.00
External/S...00.perlbench_r/500.perlbench_r    4398.00            4274.00 -124.00  9745.00             9341.00  -404.00
SingleSour...nchmarks/Adobe-C++/loop_unroll    1533.00            1413.00 -120.00  2943.00             2633.00  -310.00
External/S...rate/510.parest_r/510.parest_r   43985.00           43879.00 -106.00 87409.00            87309.00  -100.00
External/S...te/538.imagick_r/538.imagick_r    4160.00            4060.00 -100.00 10338.00            10244.00   -94.00
External/S...ed/638.imagick_s/638.imagick_s    4160.00            4060.00 -100.00 10338.00            10244.00   -94.00
MultiSourc...e/Applications/ClamAV/clamscan    2120.00            2023.00  -97.00  5035.00             4901.00  -134.00
MultiSourc...sumer-typeset/consumer-typeset    1218.00            1129.00  -89.00  3041.00             2887.00  -154.00
MultiSourc.../Applications/JM/ldecod/ldecod    1341.00            1263.00  -78.00  2316.00             2238.00   -78.00
External/S...rate/511.povray_r/511.povray_r    1734.00            1659.00  -75.00  3413.00             3246.00  -167.00
MultiSource/Applications/SPASS/SPASS           1442.00            1376.00  -66.00  2954.00             2837.00  -117.00
MultiSourc.../DOE-ProxyApps-C++/CLAMR/CLAMR    1628.00            1568.00  -60.00  3026.00             2958.00   -68.00
      regalloc.NumSpills                            regalloc.NumReloads
run             baseline         after         diff            baseline         after         diff
mean   86.725206          85.041122    -1.684083     1363.122137         1342.900383  -3.212869
```

Co-authored-by: BoyaoWang430 <wangboyao at bytedance.com>
---
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |   14 +
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h     |    2 +
 llvm/test/CodeGen/RISCV/pr69586.ll            |  821 ++---
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |   78 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    | 2208 +++++------
 .../RISCV/rvv/intrinsic-vector-match.ll       |  700 ++--
 .../RISCV/rvv/vxrm-insert-out-of-loop.ll      |    5 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 3240 ++++++++---------
 .../RISCV/wide-scalar-shift-legalization.ll   |  646 ++--
 9 files changed, 3755 insertions(+), 3959 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index cfcc3119257f65..a73bd1621a739d 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -934,3 +934,17 @@ bool RISCVRegisterInfo::getRegAllocationHints(
 
   return BaseImplRetVal;
 }
+
+unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+                                                   unsigned Idx) const {
+  if (Idx == RISCV::RegisterPressureSets::GPRAll) {
+    unsigned Reserved = 0;
+    BitVector ReservedRegs = getReservedRegs(MF);
+    for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
+      if (ReservedRegs.test(Reg))
+        Reserved++;
+
+    return 32 - Reserved;
+  }
+  return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694e175c8..ca4934de2f52d2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,6 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
   static bool isRVVRegClass(const TargetRegisterClass *RC) {
     return RISCVRI::isVRegClass(RC->TSFlags);
   }
+  unsigned getRegPressureSetLimit(const MachineFunction &MF,
+                                  unsigned Idx) const override;
 };
 } // namespace llvm
 
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 9fc9a3c42867e7..21e64ada7061aa 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -44,59 +44,50 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    addi a5, a7, 512
 ; NOREMAT-NEXT:    addi a4, a7, 1024
 ; NOREMAT-NEXT:    addi a6, a7, 1536
-; NOREMAT-NEXT:    li t4, 1
-; NOREMAT-NEXT:    li a2, 5
-; NOREMAT-NEXT:    li t1, 3
-; NOREMAT-NEXT:    li t0, 7
-; NOREMAT-NEXT:    lui t5, 1
-; NOREMAT-NEXT:    li s4, 9
-; NOREMAT-NEXT:    li s6, 11
-; NOREMAT-NEXT:    li s9, 13
-; NOREMAT-NEXT:    li ra, 15
-; NOREMAT-NEXT:    lui t2, 2
-; NOREMAT-NEXT:    lui s1, 3
-; NOREMAT-NEXT:    lui t3, 4
-; NOREMAT-NEXT:    lui s0, 5
-; NOREMAT-NEXT:    lui s3, 6
-; NOREMAT-NEXT:    lui s7, 7
+; NOREMAT-NEXT:    li t1, 1
+; NOREMAT-NEXT:    li a3, 5
+; NOREMAT-NEXT:    li t0, 3
+; NOREMAT-NEXT:    li a2, 7
+; NOREMAT-NEXT:    lui t2, 1
+; NOREMAT-NEXT:    li s5, 9
+; NOREMAT-NEXT:    li s8, 11
+; NOREMAT-NEXT:    lui s1, 2
+; NOREMAT-NEXT:    lui t5, 3
+; NOREMAT-NEXT:    lui s11, 4
+; NOREMAT-NEXT:    lui ra, 5
+; NOREMAT-NEXT:    lui t3, 6
+; NOREMAT-NEXT:    lui s0, 7
 ; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT:    slli t4, t4, 11
-; NOREMAT-NEXT:    sd t4, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli a3, a2, 9
-; NOREMAT-NEXT:    sd a3, 504(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli t6, t1, 10
-; NOREMAT-NEXT:    slli s2, t0, 9
-; NOREMAT-NEXT:    add a0, a7, t5
-; NOREMAT-NEXT:    lui s11, 1
-; NOREMAT-NEXT:    slli s4, s4, 9
-; NOREMAT-NEXT:    slli s5, a2, 10
-; NOREMAT-NEXT:    slli s6, s6, 9
-; NOREMAT-NEXT:    slli s8, t1, 11
+; NOREMAT-NEXT:    slli t4, t1, 11
+; NOREMAT-NEXT:    slli t6, a3, 9
+; NOREMAT-NEXT:    slli s2, t0, 10
+; NOREMAT-NEXT:    slli s4, a2, 9
+; NOREMAT-NEXT:    add a0, a7, t2
 ; NOREMAT-NEXT:    vle32.v v8, (a5)
-; NOREMAT-NEXT:    slli s9, s9, 9
-; NOREMAT-NEXT:    li t5, 13
+; NOREMAT-NEXT:    slli s5, s5, 9
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    slli s10, t0, 10
+; NOREMAT-NEXT:    slli s6, a3, 10
 ; NOREMAT-NEXT:    vle32.v v0, (a6)
 ; NOREMAT-NEXT:    vle32.v v12, (a6)
-; NOREMAT-NEXT:    slli ra, ra, 9
+; NOREMAT-NEXT:    slli s8, s8, 9
+; NOREMAT-NEXT:    slli s9, t0, 11
 ; NOREMAT-NEXT:    vle32.v v4, (a0)
 ; NOREMAT-NEXT:    vle32.v v20, (a0)
-; NOREMAT-NEXT:    add a4, a7, t2
+; NOREMAT-NEXT:    add a4, a7, s1
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    add a4, a7, s1
+; NOREMAT-NEXT:    add a4, a7, t5
 ; NOREMAT-NEXT:    vle32.v v28, (a4)
 ; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    add a4, a7, t3
+; NOREMAT-NEXT:    add a4, a7, s11
 ; NOREMAT-NEXT:    vle32.v v24, (a4)
 ; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    add a4, a7, s0
+; NOREMAT-NEXT:    add a4, a7, ra
 ; NOREMAT-NEXT:    vle32.v v14, (a7)
 ; NOREMAT-NEXT:    vle32.v v18, (a4)
 ; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    add a4, a7, s3
+; NOREMAT-NEXT:    add a4, a7, t3
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v8
 ; NOREMAT-NEXT:    vle32.v v14, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
@@ -107,78 +98,86 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, a3
+; NOREMAT-NEXT:    add a4, a7, t6
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v10
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, t6
+; NOREMAT-NEXT:    add a4, a7, s2
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, s2
+; NOREMAT-NEXT:    add a4, a7, s4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s7
+; NOREMAT-NEXT:    add a4, a7, s0
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v8
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, s4
+; NOREMAT-NEXT:    add a4, a7, s5
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s5
+; NOREMAT-NEXT:    add a4, a7, s6
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v8
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, s6
+; NOREMAT-NEXT:    add a4, a7, s8
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s8
+; NOREMAT-NEXT:    add a4, a7, s9
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, s9
+; NOREMAT-NEXT:    li t5, 13
+; NOREMAT-NEXT:    slli a4, t5, 9
+; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s10
+; NOREMAT-NEXT:    slli a4, a2, 10
+; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, ra
+; NOREMAT-NEXT:    li a6, 15
+; NOREMAT-NEXT:    slli a4, a6, 9
+; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT:    lui t4, 8
-; NOREMAT-NEXT:    add a5, a7, t4
+; NOREMAT-NEXT:    lui t1, 8
+; NOREMAT-NEXT:    add a5, a7, t1
 ; NOREMAT-NEXT:    vle32.v v20, (a5)
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v2
 ; NOREMAT-NEXT:    li a4, 17
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li s1, 17
-; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li t2, 17
+; NOREMAT-NEXT:    sd a4, 600(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
 ; NOREMAT-NEXT:    li a5, 9
 ; NOREMAT-NEXT:    slli a4, a5, 10
-; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a4, 592(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
 ; NOREMAT-NEXT:    li a4, 19
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li t2, 19
-; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li s1, 19
+; NOREMAT-NEXT:    sd a4, 584(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    slli a3, a2, 11
-; NOREMAT-NEXT:    sd a3, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli a3, a3, 11
+; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
 ; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
@@ -186,46 +185,45 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li s7, 21
 ; NOREMAT-NEXT:    slli a3, s7, 9
-; NOREMAT-NEXT:    sd a3, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    li a6, 11
-; NOREMAT-NEXT:    slli a3, a6, 10
-; NOREMAT-NEXT:    sd a3, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li a4, 11
+; NOREMAT-NEXT:    slli a3, a4, 10
+; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
 ; NOREMAT-NEXT:    li s3, 23
-; NOREMAT-NEXT:    slli a3, s3, 9
-; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    slli s10, s3, 9
+; NOREMAT-NEXT:    add a3, a7, s10
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
 ; NOREMAT-NEXT:    li s0, 25
 ; NOREMAT-NEXT:    slli a3, s0, 9
-; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
 ; NOREMAT-NEXT:    slli a3, t5, 10
-; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a3, 544(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
 ; NOREMAT-NEXT:    li t3, 27
 ; NOREMAT-NEXT:    slli a3, t3, 9
-; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a3, 536(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v28, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    slli a2, t0, 11
-; NOREMAT-NEXT:    sd a2, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli a2, a2, 11
+; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
@@ -233,39 +231,37 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li t0, 29
 ; NOREMAT-NEXT:    slli a2, t0, 9
-; NOREMAT-NEXT:    sd a2, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT:    li a3, 15
-; NOREMAT-NEXT:    slli a2, a3, 10
-; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli a2, a6, 10
+; NOREMAT-NEXT:    sd a2, 512(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT:    li t1, 31
-; NOREMAT-NEXT:    slli a2, t1, 9
-; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT:    lui a4, 4
-; NOREMAT-NEXT:    addiw a0, a4, 512
-; NOREMAT-NEXT:    sd a0, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li a3, 31
+; NOREMAT-NEXT:    slli a0, a3, 9
+; NOREMAT-NEXT:    sd a0, 504(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a0, a7, a0
-; NOREMAT-NEXT:    vle32.v v8, (a0)
-; NOREMAT-NEXT:    vle32.v v26, (a0)
+; NOREMAT-NEXT:    vle32.v v12, (a0)
+; NOREMAT-NEXT:    vle32.v v4, (a0)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
+; NOREMAT-NEXT:    addiw a2, s11, 512
+; NOREMAT-NEXT:    sd a2, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
-; NOREMAT-NEXT:    slli a2, s1, 10
+; NOREMAT-NEXT:    slli a2, t2, 10
 ; NOREMAT-NEXT:    sd a2, 488(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    addiw a2, a4, 1536
+; NOREMAT-NEXT:    addiw a2, s11, 1536
 ; NOREMAT-NEXT:    sd a2, 480(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
@@ -277,27 +273,25 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v8
-; NOREMAT-NEXT:    lui a5, 5
-; NOREMAT-NEXT:    addiw a2, a5, -1536
+; NOREMAT-NEXT:    addiw a2, ra, -1536
 ; NOREMAT-NEXT:    sd a2, 464(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
-; NOREMAT-NEXT:    slli a2, t2, 10
+; NOREMAT-NEXT:    slli a2, s1, 10
 ; NOREMAT-NEXT:    sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li t2, 19
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT:    addiw a2, a5, -512
+; NOREMAT-NEXT:    addiw a2, ra, -512
 ; NOREMAT-NEXT:    sd a2, 448(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v24
-; NOREMAT-NEXT:    addiw a2, a5, 512
+; NOREMAT-NEXT:    addiw a2, ra, 512
 ; NOREMAT-NEXT:    sd a2, 440(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
@@ -309,20 +303,20 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
-; NOREMAT-NEXT:    addiw a2, a5, 1536
+; NOREMAT-NEXT:    addiw a2, ra, 1536
 ; NOREMAT-NEXT:    sd a2, 424(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
-; NOREMAT-NEXT:    slli a2, a6, 11
+; NOREMAT-NEXT:    slli a2, a4, 11
 ; NOREMAT-NEXT:    sd a2, 416(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v12
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
-; NOREMAT-NEXT:    lui a6, 6
-; NOREMAT-NEXT:    addiw a2, a6, -1536
+; NOREMAT-NEXT:    lui a4, 6
+; NOREMAT-NEXT:    addiw a2, a4, -1536
 ; NOREMAT-NEXT:    sd a2, 408(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
@@ -334,13 +328,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT:    addiw a2, a6, -512
+; NOREMAT-NEXT:    addiw a2, a4, -512
 ; NOREMAT-NEXT:    sd a2, 392(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT:    addiw a2, a6, 512
+; NOREMAT-NEXT:    addiw a2, a4, 512
 ; NOREMAT-NEXT:    sd a2, 384(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
@@ -352,7 +346,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v18
-; NOREMAT-NEXT:    addiw a2, a6, 1536
+; NOREMAT-NEXT:    addiw a2, a4, 1536
 ; NOREMAT-NEXT:    sd a2, 368(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
@@ -364,8 +358,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v8
-; NOREMAT-NEXT:    lui s0, 7
-; NOREMAT-NEXT:    addiw a2, s0, -1536
+; NOREMAT-NEXT:    lui a5, 7
+; NOREMAT-NEXT:    addiw a2, a5, -1536
 ; NOREMAT-NEXT:    sd a2, 352(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
@@ -379,15 +373,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    addi a0, sp, 640
 ; NOREMAT-NEXT:    vl2r.v v12, (a0) # Unknown-size Folded Reload
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v22
-; NOREMAT-NEXT:    addiw a2, s0, -512
+; NOREMAT-NEXT:    addiw a2, a5, -512
 ; NOREMAT-NEXT:    sd a2, 336(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
-; NOREMAT-NEXT:    addiw a2, s0, 512
+; NOREMAT-NEXT:    addiw a2, a5, 512
 ; NOREMAT-NEXT:    sd a2, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    lui t3, 7
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
@@ -398,30 +391,30 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v16
-; NOREMAT-NEXT:    addiw a2, t3, 1536
+; NOREMAT-NEXT:    addiw a2, a5, 1536
 ; NOREMAT-NEXT:    sd a2, 312(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, a3, 11
+; NOREMAT-NEXT:    slli a2, a6, 11
 ; NOREMAT-NEXT:    sd a2, 304(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v14
-; NOREMAT-NEXT:    addiw a2, t4, -1536
+; NOREMAT-NEXT:    addiw a2, t1, -1536
 ; NOREMAT-NEXT:    sd a2, 296(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v14, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
-; NOREMAT-NEXT:    slli a2, t1, 10
+; NOREMAT-NEXT:    slli a2, a3, 10
 ; NOREMAT-NEXT:    sd a2, 288(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v22
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addiw a0, t4, -512
+; NOREMAT-NEXT:    addiw a0, t1, -512
 ; NOREMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a0, a7, a0
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
@@ -438,32 +431,33 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    addi a0, a1, 1024
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    add s11, a1, s11
-; NOREMAT-NEXT:    sd s11, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 1
+; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 2
 ; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 3
 ; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add s11, a1, s11
+; NOREMAT-NEXT:    sd s11, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 240(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a1, a4
-; NOREMAT-NEXT:    sd a4, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a4, 232(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a5, a1, a5
-; NOREMAT-NEXT:    sd a5, 240(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a1, a6
-; NOREMAT-NEXT:    sd a6, 232(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add t3, a1, t3
-; NOREMAT-NEXT:    sd t3, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a1, t4
+; NOREMAT-NEXT:    sd a5, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a1, t1
 ; NOREMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 512
+; NOREMAT-NEXT:    addiw a0, t1, 512
 ; NOREMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 1024
+; NOREMAT-NEXT:    addiw a0, t1, 1024
 ; NOREMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 1536
+; NOREMAT-NEXT:    addiw a0, t1, 1536
 ; NOREMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli s1, s1, 11
-; NOREMAT-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli t2, t2, 11
+; NOREMAT-NEXT:    sd t2, 128(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 9
 ; NOREMAT-NEXT:    addiw a2, a0, -1536
 ; NOREMAT-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
@@ -476,7 +470,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    addiw s11, a0, 512
 ; NOREMAT-NEXT:    addiw s7, a0, 1024
 ; NOREMAT-NEXT:    addiw s3, a0, 1536
-; NOREMAT-NEXT:    slli s1, t2, 11
+; NOREMAT-NEXT:    slli s1, s1, 11
 ; NOREMAT-NEXT:    lui a0, 10
 ; NOREMAT-NEXT:    addiw t2, a0, -1536
 ; NOREMAT-NEXT:    addiw a7, a0, -1024
@@ -484,52 +478,52 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    add a2, a1, a0
 ; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    addiw a0, a0, 512
-; NOREMAT-NEXT:    ld a2, 512(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    ld a3, 504(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a3, a1, a3
-; NOREMAT-NEXT:    add a5, a1, t6
-; NOREMAT-NEXT:    add a6, a1, s2
-; NOREMAT-NEXT:    add t0, a1, s4
-; NOREMAT-NEXT:    add t1, a1, s5
-; NOREMAT-NEXT:    add t3, a1, s6
-; NOREMAT-NEXT:    add t4, a1, s8
-; NOREMAT-NEXT:    add t5, a1, s9
-; NOREMAT-NEXT:    add t6, a1, s10
-; NOREMAT-NEXT:    add s0, a1, ra
-; NOREMAT-NEXT:    ld s2, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a2, a1, t4
+; NOREMAT-NEXT:    add a3, a1, t6
+; NOREMAT-NEXT:    add a5, a1, s2
+; NOREMAT-NEXT:    add a6, a1, s4
+; NOREMAT-NEXT:    add t0, a1, s5
+; NOREMAT-NEXT:    add t1, a1, s6
+; NOREMAT-NEXT:    add t3, a1, s8
+; NOREMAT-NEXT:    add t4, a1, s9
+; NOREMAT-NEXT:    ld t5, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add t5, a1, t5
+; NOREMAT-NEXT:    ld t6, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add t6, a1, t6
+; NOREMAT-NEXT:    ld s0, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s0, a1, s0
+; NOREMAT-NEXT:    ld s2, 600(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s2, a1, s2
-; NOREMAT-NEXT:    ld s4, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s4, 592(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s4, a1, s4
-; NOREMAT-NEXT:    ld s5, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s5, 584(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s5, a1, s5
-; NOREMAT-NEXT:    ld s6, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s6, 576(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s6, a1, s6
-; NOREMAT-NEXT:    ld s8, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s8, 568(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s8, a1, s8
-; NOREMAT-NEXT:    ld s9, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s9, 560(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s9, a1, s9
-; NOREMAT-NEXT:    ld s10, 576(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s10, a1, s10
-; NOREMAT-NEXT:    ld ra, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 16(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 512(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 504(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 80(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 496(sp) # 8-byte Folded Reload
@@ -923,10 +917,9 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_offset s10, -96
 ; REMAT-NEXT:    .cfi_offset s11, -104
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 18
-; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    slli a2, a2, 3
 ; REMAT-NEXT:    sub sp, sp, a2
-; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb
+; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb
 ; REMAT-NEXT:    li a4, 32
 ; REMAT-NEXT:    addi a5, a0, 512
 ; REMAT-NEXT:    addi a3, a0, 1024
@@ -963,23 +956,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli s6, s6, 9
 ; REMAT-NEXT:    li s7, 5
 ; REMAT-NEXT:    slli s7, s7, 11
-; REMAT-NEXT:    li s8, 21
-; REMAT-NEXT:    slli s8, s8, 9
-; REMAT-NEXT:    li s9, 11
-; REMAT-NEXT:    slli s9, s9, 10
-; REMAT-NEXT:    li s10, 23
-; REMAT-NEXT:    slli s10, s10, 9
-; REMAT-NEXT:    lui s11, 3
-; REMAT-NEXT:    li ra, 25
-; REMAT-NEXT:    slli ra, ra, 9
 ; REMAT-NEXT:    vsetvli zero, a4, e32, m2, ta, ma
 ; REMAT-NEXT:    vle32.v v8, (a5)
-; REMAT-NEXT:    li a4, 13
-; REMAT-NEXT:    slli a4, a4, 10
+; REMAT-NEXT:    li a4, 21
+; REMAT-NEXT:    slli a4, a4, 9
 ; REMAT-NEXT:    vle32.v v10, (a3)
 ; REMAT-NEXT:    vle32.v v12, (a3)
-; REMAT-NEXT:    li a3, 27
-; REMAT-NEXT:    slli a3, a3, 9
+; REMAT-NEXT:    li a3, 11
+; REMAT-NEXT:    slli a3, a3, 10
 ; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    add a2, a0, a6
@@ -995,7 +979,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 4
+; REMAT-NEXT:    li a5, 6
+; REMAT-NEXT:    mul a2, a2, a5
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1004,8 +989,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a5, 14
-; REMAT-NEXT:    mul a2, a2, a5
+; REMAT-NEXT:    slli a2, a2, 2
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1019,17 +1003,11 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v14
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    add a2, a0, t5
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a5, 12
-; REMAT-NEXT:    mul a2, a2, a5
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
 ; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    add a2, a0, s0
@@ -1039,403 +1017,340 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, s1
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v30
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    add a2, a0, s2
-; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    slli a5, a5, 4
+; REMAT-NEXT:    li a6, 6
+; REMAT-NEXT:    mul a5, a5, a6
 ; REMAT-NEXT:    add a5, sp, a5
 ; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v12, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT:    vl2r.v v28, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v2
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    add a2, a0, s3
-; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    li a6, 14
-; REMAT-NEXT:    mul a5, a5, a6
+; REMAT-NEXT:    slli a5, a5, 2
 ; REMAT-NEXT:    add a5, sp, a5
 ; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v16, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
-; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    vl2r.v v30, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v4
+; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    add a2, a0, s4
-; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    add a2, a0, s5
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v14
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    add a2, a0, s6
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    li a6, 12
-; REMAT-NEXT:    mul a5, a5, a6
-; REMAT-NEXT:    add a5, sp, a5
-; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v0, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    add a2, a0, s5
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v8
 ; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    add a2, a0, s6
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v16
 ; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    add a2, a0, s8
+; REMAT-NEXT:    add a2, a0, a4
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    add a2, a0, s9
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
-; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    add a2, a0, s10
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v12
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v12, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, s11
+; REMAT-NEXT:    addi a2, sp, 432
+; REMAT-NEXT:    vs2r.v v24, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v12
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v16
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    add a2, a0, ra
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 1
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, a4
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
+; REMAT-NEXT:    li a5, 23
+; REMAT-NEXT:    slli a5, a5, 9
+; REMAT-NEXT:    add a2, a0, a5
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v28
 ; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 2
+; REMAT-NEXT:    li a3, 6
+; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    lui s8, 3
+; REMAT-NEXT:    add a2, a0, s8
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v30
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 4
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a5, 7
-; REMAT-NEXT:    slli a5, a5, 11
-; REMAT-NEXT:    add a2, a0, a5
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    addi a3, sp, 432
-; REMAT-NEXT:    vs2r.v v18, (a3) # Unknown-size Folded Spill
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v20
-; REMAT-NEXT:    vle32.v v18, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 14
-; REMAT-NEXT:    mul a2, a2, a3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a2, 29
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v24
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 12
-; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    slli a2, a2, 2
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v20, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a2, 15
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li s9, 25
+; REMAT-NEXT:    slli s9, s9, 9
+; REMAT-NEXT:    add a2, a0, s9
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 10
-; REMAT-NEXT:    mul a2, a2, a3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a2, 31
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v6
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    li s10, 13
+; REMAT-NEXT:    slli s10, s10, 10
+; REMAT-NEXT:    add a2, a0, s10
 ; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 3
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v12
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v8
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    slli a2, a2, 1
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    li s11, 27
+; REMAT-NEXT:    slli s11, s11, 9
+; REMAT-NEXT:    add a2, a0, s11
 ; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v2
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 6
-; REMAT-NEXT:    mul a2, a2, a3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    addiw a2, a2, 512
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v16
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    li ra, 7
+; REMAT-NEXT:    slli ra, ra, 11
+; REMAT-NEXT:    add a2, a0, ra
 ; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 1
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v20
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li a2, 17
-; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    li a2, 29
+; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 2
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    addi a3, sp, 432
 ; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    addiw a2, a2, 1536
+; REMAT-NEXT:    li a2, 15
+; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 4
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    addi a3, sp, 432
-; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 9
-; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    li a2, 31
+; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 14
+; REMAT-NEXT:    li a4, 6
 ; REMAT-NEXT:    mul a3, a3, a4
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v18
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 12
-; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    slli a3, a3, 2
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    li a2, 19
-; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    addiw a2, a2, 512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 10
-; REMAT-NEXT:    mul a3, a3, a4
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v14, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    li a2, 17
+; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    slli a3, a3, 1
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v16, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    addiw a2, a2, 1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 6
-; REMAT-NEXT:    mul a3, a3, a4
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v18, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    li a2, 9
+; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li s7, 21
-; REMAT-NEXT:    slli s7, s7, 10
-; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    lui s4, 5
-; REMAT-NEXT:    addiw s4, s4, 1536
-; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    li a2, 19
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 11
-; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, -512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui s3, 6
-; REMAT-NEXT:    addiw s3, s3, -1536
-; REMAT-NEXT:    add a2, a0, s3
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    li s2, 23
-; REMAT-NEXT:    slli s2, s2, 10
-; REMAT-NEXT:    add a2, a0, s2
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    lui a2, 6
-; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    li a2, 21
+; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, 1536
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    lui s1, 6
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui s0, 6
-; REMAT-NEXT:    addiw s0, s0, 512
-; REMAT-NEXT:    add a2, a0, s0
+; REMAT-NEXT:    li a2, 11
+; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li a2, 25
-; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    addiw a2, a2, -1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    lui t6, 6
-; REMAT-NEXT:    addiw t6, t6, 1536
-; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    li a2, 23
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li t5, 13
-; REMAT-NEXT:    slli t5, t5, 11
-; REMAT-NEXT:    add a2, a0, t5
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    lui a2, 6
 ; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s1, 6
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    li t4, 27
-; REMAT-NEXT:    slli t4, t4, 10
-; REMAT-NEXT:    add a2, a0, t4
+; REMAT-NEXT:    lui s0, 6
+; REMAT-NEXT:    addiw s0, s0, 512
+; REMAT-NEXT:    add a2, a0, s0
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    li a2, 25
+; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    lui t3, 7
+; REMAT-NEXT:    lui t6, 6
+; REMAT-NEXT:    addiw t6, t6, 1536
+; REMAT-NEXT:    add a2, a0, t6
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui t2, 7
-; REMAT-NEXT:    addiw t2, t2, 512
-; REMAT-NEXT:    add a2, a0, t2
+; REMAT-NEXT:    li t5, 13
+; REMAT-NEXT:    slli t5, t5, 11
+; REMAT-NEXT:    add a2, a0, t5
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li t1, 29
-; REMAT-NEXT:    slli t1, t1, 10
-; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    lui t0, 7
-; REMAT-NEXT:    addiw t0, t0, 1536
-; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    li t4, 27
+; REMAT-NEXT:    slli t4, t4, 10
+; REMAT-NEXT:    add a2, a0, t4
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a7, 15
-; REMAT-NEXT:    slli a7, a7, 11
-; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a6, 8
-; REMAT-NEXT:    addiw a6, a6, -1536
-; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t3, 7
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    li a4, 31
-; REMAT-NEXT:    slli a4, a4, 10
-; REMAT-NEXT:    add a2, a0, a4
+; REMAT-NEXT:    lui t2, 7
+; REMAT-NEXT:    addiw t2, t2, 512
+; REMAT-NEXT:    add a2, a0, t2
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    lui a3, 8
-; REMAT-NEXT:    addiw a3, a3, -512
-; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    li t1, 29
+; REMAT-NEXT:    slli t1, t1, 10
+; REMAT-NEXT:    add a2, a0, t1
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 8
-; REMAT-NEXT:    add a0, a0, a2
-; REMAT-NEXT:    vle32.v v4, (a0)
+; REMAT-NEXT:    lui t0, 7
+; REMAT-NEXT:    addiw t0, t0, 1536
+; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    li a7, 15
+; REMAT-NEXT:    slli a7, a7, 11
+; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    lui a6, 8
+; REMAT-NEXT:    addiw a6, a6, -1536
+; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    li a4, 31
+; REMAT-NEXT:    slli a4, a4, 10
+; REMAT-NEXT:    add a2, a0, a4
+; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    lui a3, 8
+; REMAT-NEXT:    addiw a3, a3, -512
+; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT:    vle32.v v10, (a2)
+; REMAT-NEXT:    lui a2, 8
+; REMAT-NEXT:    add a0, a0, a2
+; REMAT-NEXT:    vle32.v v28, (a0)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    addi a0, a1, 1024
 ; REMAT-NEXT:    vse32.v v8, (a0)
@@ -1482,45 +1397,38 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 336(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 15
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 328(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    lui a0, 2
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 320(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 17
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s2, a1, s2
+; REMAT-NEXT:    sd s2, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s3, a1, s3
+; REMAT-NEXT:    sd s3, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s4, a1, s4
+; REMAT-NEXT:    sd s4, 312(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s5, a1, s5
 ; REMAT-NEXT:    sd s5, 304(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s6, a1, s6
 ; REMAT-NEXT:    sd s6, 296(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 5
-; REMAT-NEXT:    slli a0, a0, 11
+; REMAT-NEXT:    add s7, a1, s7
+; REMAT-NEXT:    sd s7, 288(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 21
+; REMAT-NEXT:    slli a0, a0, 9
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 11
+; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 288(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add a5, a1, a5
+; REMAT-NEXT:    sd a5, 264(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s8, a1, s8
-; REMAT-NEXT:    sd s8, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s8, 256(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s9, a1, s9
-; REMAT-NEXT:    sd s9, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s9, 248(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s10, a1, s10
-; REMAT-NEXT:    sd s10, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s10, 240(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s11, a1, s11
-; REMAT-NEXT:    sd s11, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s11, 232(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add ra, a1, ra
-; REMAT-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 13
-; REMAT-NEXT:    slli a0, a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 27
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 232(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add a5, a1, a5
-; REMAT-NEXT:    sd a5, 224(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd ra, 224(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 29
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
@@ -1571,18 +1479,26 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    addiw a0, a0, 512
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s7, a1, s7
-; REMAT-NEXT:    sd s7, 112(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s4, a1, s4
-; REMAT-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 21
+; REMAT-NEXT:    slli a0, a0, 10
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    addiw a0, a0, 1536
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 11
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s3, a1, s3
-; REMAT-NEXT:    sd s3, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s2, a1, s2
-; REMAT-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 6
+; REMAT-NEXT:    addiw a0, a0, -1536
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 23
+; REMAT-NEXT:    slli a0, a0, 10
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 6
 ; REMAT-NEXT:    addiw a0, a0, -512
 ; REMAT-NEXT:    add a0, a1, a0
@@ -1879,8 +1795,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    csrr a0, vlenb
-; REMAT-NEXT:    li a1, 18
-; REMAT-NEXT:    mul a0, a0, a1
+; REMAT-NEXT:    slli a0, a0, 3
 ; REMAT-NEXT:    add sp, sp, a0
 ; REMAT-NEXT:    .cfi_def_cfa sp, 544
 ; REMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 575a757149ebba..0b5856a7000dd4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -5682,28 +5682,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ;
 ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    addi sp, sp, -48
-; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 48
-; RV32ZVE32F-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    addi sp, sp, -16
+; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
+; RV32ZVE32F-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s3, 0(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
 ; RV32ZVE32F-NEXT:    .cfi_offset s3, -16
-; RV32ZVE32F-NEXT:    .cfi_offset s4, -20
-; RV32ZVE32F-NEXT:    .cfi_offset s5, -24
-; RV32ZVE32F-NEXT:    .cfi_offset s6, -28
-; RV32ZVE32F-NEXT:    .cfi_offset s7, -32
-; RV32ZVE32F-NEXT:    .cfi_offset s8, -36
-; RV32ZVE32F-NEXT:    .cfi_offset s9, -40
 ; RV32ZVE32F-NEXT:    .cfi_remember_state
 ; RV32ZVE32F-NEXT:    lw a3, 56(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 60(a0)
@@ -5715,30 +5703,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    lw t4, 28(a0)
 ; RV32ZVE32F-NEXT:    lw t1, 32(a0)
 ; RV32ZVE32F-NEXT:    lw t2, 36(a0)
+; RV32ZVE32F-NEXT:    lw t5, 0(a2)
+; RV32ZVE32F-NEXT:    lw t6, 8(a2)
+; RV32ZVE32F-NEXT:    lw s0, 16(a2)
+; RV32ZVE32F-NEXT:    lw s1, 24(a2)
+; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.v.x v8, t5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t6
+; RV32ZVE32F-NEXT:    lw t5, 32(a2)
+; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw s2, 48(a2)
+; RV32ZVE32F-NEXT:    lw s3, 56(a2)
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s0
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s1
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t6
 ; RV32ZVE32F-NEXT:    lw s0, 8(a0)
 ; RV32ZVE32F-NEXT:    lw s1, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t5, 16(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 20(a0)
-; RV32ZVE32F-NEXT:    lw s2, 32(a2)
-; RV32ZVE32F-NEXT:    lw s3, 40(a2)
-; RV32ZVE32F-NEXT:    lw s4, 48(a2)
-; RV32ZVE32F-NEXT:    lw s5, 56(a2)
-; RV32ZVE32F-NEXT:    lw s6, 0(a2)
-; RV32ZVE32F-NEXT:    lw s7, 8(a2)
-; RV32ZVE32F-NEXT:    lw s8, 16(a2)
-; RV32ZVE32F-NEXT:    lw s9, 24(a2)
-; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.v.x v8, s6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s9
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    andi s2, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5771,27 +5759,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    sw a3, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB51_9: # %else14
-; RV32ZVE32F-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s3, 0(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    .cfi_restore s0
 ; RV32ZVE32F-NEXT:    .cfi_restore s1
 ; RV32ZVE32F-NEXT:    .cfi_restore s2
 ; RV32ZVE32F-NEXT:    .cfi_restore s3
-; RV32ZVE32F-NEXT:    .cfi_restore s4
-; RV32ZVE32F-NEXT:    .cfi_restore s5
-; RV32ZVE32F-NEXT:    .cfi_restore s6
-; RV32ZVE32F-NEXT:    .cfi_restore s7
-; RV32ZVE32F-NEXT:    .cfi_restore s8
-; RV32ZVE32F-NEXT:    .cfi_restore s9
-; RV32ZVE32F-NEXT:    addi sp, sp, 48
+; RV32ZVE32F-NEXT:    addi sp, sp, 16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZVE32F-NEXT:    ret
 ; RV32ZVE32F-NEXT:  .LBB51_10: # %cond.store
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 03d5762b4903ef..036fee6a13ca4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1364,19 +1364,16 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 13
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v16
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v18, v8, 8
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1384,52 +1381,51 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v16, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v19, v16, 5
 ; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 10
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 20
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a4, a2, 4
+; ZVFHMIN32-NEXT:    sub a2, a4, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 13
 ; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 11
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a4, a2, 4
+; ZVFHMIN32-NEXT:    add a2, a4, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN32-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 11
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 8
+; ZVFHMIN32-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v4, v16, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1437,12 +1433,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 302(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 5
-; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 3
-; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v31, v0, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v5, v0, 2
 ; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 1
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
@@ -1453,63 +1449,63 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 14
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 3
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 13
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 6
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 6
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 12
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 12
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a2, a2, 3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 11
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 10
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 13
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    li a4, 19
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a4, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
-; ZVFHMIN32-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v26
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 300(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v28
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v26
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v4
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
+; ZVFHMIN32-NEXT:    vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v16
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v6
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1517,229 +1513,234 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 296(sp)
 ; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v30
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v22
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 294(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v22
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v20
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw a2, 108(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 211(sp)
-; ZVFHMIN32-NEXT:    lh a1, 548(sp)
-; ZVFHMIN32-NEXT:    lh t5, 292(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v14
-; ZVFHMIN32-NEXT:    sw a0, 116(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN32-NEXT:    sw a0, 124(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 210(sp)
-; ZVFHMIN32-NEXT:    lh a1, 546(sp)
-; ZVFHMIN32-NEXT:    lh t5, 290(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v24
+; ZVFHMIN32-NEXT:    lh a0, 548(sp)
+; ZVFHMIN32-NEXT:    lh a1, 292(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v14
+; ZVFHMIN32-NEXT:    sw a2, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    sw a2, 124(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
-; ZVFHMIN32-NEXT:    sb a1, 209(sp)
-; ZVFHMIN32-NEXT:    lh a1, 544(sp)
-; ZVFHMIN32-NEXT:    lh t5, 288(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 192(sp)
-; ZVFHMIN32-NEXT:    sb a1, 208(sp)
-; ZVFHMIN32-NEXT:    lh t5, 738(sp)
-; ZVFHMIN32-NEXT:    lh t6, 482(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v12
-; ZVFHMIN32-NEXT:    sw a0, 108(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN32-NEXT:    sw a0, 120(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 177(sp)
-; ZVFHMIN32-NEXT:    lh t5, 736(sp)
-; ZVFHMIN32-NEXT:    lh t6, 480(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 29
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s5, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 28
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s6, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 176(sp)
-; ZVFHMIN32-NEXT:    lh t5, 734(sp)
-; ZVFHMIN32-NEXT:    lh t6, 478(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 27
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s7, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 26
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s8, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 175(sp)
-; ZVFHMIN32-NEXT:    lh t5, 732(sp)
-; ZVFHMIN32-NEXT:    lh t6, 476(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 25
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s4, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 24
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s3, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 174(sp)
-; ZVFHMIN32-NEXT:    lh t6, 730(sp)
-; ZVFHMIN32-NEXT:    lh s9, 474(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 23
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s2, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v3
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 210(sp)
+; ZVFHMIN32-NEXT:    lh a0, 546(sp)
+; ZVFHMIN32-NEXT:    lh a1, 290(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v24
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a0, 209(sp)
+; ZVFHMIN32-NEXT:    lh a0, 544(sp)
+; ZVFHMIN32-NEXT:    lh a1, 288(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a3, 192(sp)
+; ZVFHMIN32-NEXT:    sb a0, 208(sp)
+; ZVFHMIN32-NEXT:    lh a0, 738(sp)
+; ZVFHMIN32-NEXT:    lh a1, 482(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
+; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v12
+; ZVFHMIN32-NEXT:    sw a2, 120(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 177(sp)
+; ZVFHMIN32-NEXT:    lh a0, 736(sp)
+; ZVFHMIN32-NEXT:    lh a1, 480(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 29
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 28
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 176(sp)
+; ZVFHMIN32-NEXT:    lh a0, 734(sp)
+; ZVFHMIN32-NEXT:    lh a1, 478(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 27
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s6, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 26
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 175(sp)
+; ZVFHMIN32-NEXT:    lh a0, 732(sp)
+; ZVFHMIN32-NEXT:    lh a1, 476(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 25
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s7, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 174(sp)
+; ZVFHMIN32-NEXT:    lh a0, 730(sp)
+; ZVFHMIN32-NEXT:    lh a1, 474(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 23
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s8, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v21
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 173(sp)
+; ZVFHMIN32-NEXT:    lh a0, 728(sp)
+; ZVFHMIN32-NEXT:    lh a1, 472(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v3
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v19
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 172(sp)
+; ZVFHMIN32-NEXT:    lh a0, 726(sp)
+; ZVFHMIN32-NEXT:    lh a1, 470(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v11
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v7
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 171(sp)
+; ZVFHMIN32-NEXT:    lh a0, 724(sp)
+; ZVFHMIN32-NEXT:    lh s9, 468(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v29
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t6, 173(sp)
-; ZVFHMIN32-NEXT:    lh s9, 728(sp)
-; ZVFHMIN32-NEXT:    lh s10, 472(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v31
-; ZVFHMIN32-NEXT:    vmv.x.s ra, v13
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN32-NEXT:    sb s9, 172(sp)
-; ZVFHMIN32-NEXT:    lh s9, 726(sp)
-; ZVFHMIN32-NEXT:    lh s10, 470(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v29
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v11
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN32-NEXT:    sb s9, 171(sp)
-; ZVFHMIN32-NEXT:    lh s10, 724(sp)
-; ZVFHMIN32-NEXT:    lh s11, 468(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v7
-; ZVFHMIN32-NEXT:    vmv.x.s s9, v9
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s11
-; ZVFHMIN32-NEXT:    feq.h s10, fa5, fa4
-; ZVFHMIN32-NEXT:    sb s10, 170(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 170(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 466(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s10, v21
-; ZVFHMIN32-NEXT:    vmv.x.s s11, v27
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v31
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 169(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 720(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 464(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v27
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN32-NEXT:    sb a0, 168(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 718(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 462(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, s7
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s8
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, ra
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s6
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 167(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 716(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a2
 ; ZVFHMIN32-NEXT:    lh a1, 460(sp)
-; ZVFHMIN32-NEXT:    feq.h s5, fa5, fa1
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN32-NEXT:    sb a1, 166(sp)
-; ZVFHMIN32-NEXT:    lh a1, 714(sp)
-; ZVFHMIN32-NEXT:    lh a2, 458(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a3, fa3, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
-; ZVFHMIN32-NEXT:    sb a1, 165(sp)
-; ZVFHMIN32-NEXT:    lh a1, 712(sp)
-; ZVFHMIN32-NEXT:    lh a2, 456(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT:    feq.h a4, fa2, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa3, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, s2
-; ZVFHMIN32-NEXT:    sb a1, 164(sp)
-; ZVFHMIN32-NEXT:    lh a1, 710(sp)
-; ZVFHMIN32-NEXT:    lh a2, 454(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s9
-; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s3
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, s7
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa0, ft0
+; ZVFHMIN32-NEXT:    sb a0, 166(sp)
+; ZVFHMIN32-NEXT:    lh a0, 714(sp)
+; ZVFHMIN32-NEXT:    lh a1, 458(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, s4
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, s8
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, ft1, ft2
+; ZVFHMIN32-NEXT:    sb a0, 165(sp)
+; ZVFHMIN32-NEXT:    lh a0, 712(sp)
+; ZVFHMIN32-NEXT:    lh a1, 456(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, s10
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, s11
+; ZVFHMIN32-NEXT:    fmv.h.x ft3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, ft3, ft4
+; ZVFHMIN32-NEXT:    sb a0, 164(sp)
+; ZVFHMIN32-NEXT:    lh a0, 710(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft3, a4
+; ZVFHMIN32-NEXT:    lh a1, 454(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft4, ra
+; ZVFHMIN32-NEXT:    fmv.h.x ft5, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN32-NEXT:    feq.h a1, ft5, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
 ; ZVFHMIN32-NEXT:    sb a1, 163(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 708(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, a2
 ; ZVFHMIN32-NEXT:    lh a2, 452(sp)
-; ZVFHMIN32-NEXT:    feq.h s3, fa4, fa5
-; ZVFHMIN32-NEXT:    feq.h s4, fa3, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 162(sp)
-; ZVFHMIN32-NEXT:    lh a1, 706(sp)
-; ZVFHMIN32-NEXT:    lh a2, 450(sp)
-; ZVFHMIN32-NEXT:    sb s4, 129(sp)
-; ZVFHMIN32-NEXT:    sb s3, 130(sp)
-; ZVFHMIN32-NEXT:    sb s2, 131(sp)
-; ZVFHMIN32-NEXT:    sb a4, 132(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa0, fa5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a1, ft0, ft1
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    sb a2, 162(sp)
+; ZVFHMIN32-NEXT:    lh a2, 706(sp)
+; ZVFHMIN32-NEXT:    lh a4, 450(sp)
+; ZVFHMIN32-NEXT:    sb a1, 129(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa1, fa5
+; ZVFHMIN32-NEXT:    sb a3, 130(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa2, ft4
+; ZVFHMIN32-NEXT:    sb a1, 131(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa4, ft2
+; ZVFHMIN32-NEXT:    sb a3, 132(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa3, ft3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a3, 133(sp)
-; ZVFHMIN32-NEXT:    sb a0, 134(sp)
-; ZVFHMIN32-NEXT:    sb s5, 135(sp)
-; ZVFHMIN32-NEXT:    sb a1, 161(sp)
+; ZVFHMIN32-NEXT:    sb a1, 134(sp)
+; ZVFHMIN32-NEXT:    sb a0, 135(sp)
+; ZVFHMIN32-NEXT:    sb a2, 161(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 354(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s6, v5
-; ZVFHMIN32-NEXT:    vmv.x.s s5, v23
+; ZVFHMIN32-NEXT:    vmv.x.s s4, v23
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 10
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1747,13 +1748,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 352(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 20
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 4
+; ZVFHMIN32-NEXT:    sub a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
@@ -1762,153 +1762,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 350(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 239(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 604(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 348(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 238(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 602(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 346(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 237(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 600(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 344(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 236(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 598(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 342(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 235(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 596(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 340(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 234(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 594(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 338(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 233(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 592(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    lh t5, 336(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT:    lh a1, 336(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
 ; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
 ; ZVFHMIN32-NEXT:    sb a0, 232(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 590(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a3
-; ZVFHMIN32-NEXT:    lh a2, 334(sp)
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    feq.h t6, fa4, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN32-NEXT:    lh a1, 334(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s4
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 231(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 588(sp)
-; ZVFHMIN32-NEXT:    lh a2, 332(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    lh a1, 332(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, s2
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, s5
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, ft0, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 230(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 586(sp)
-; ZVFHMIN32-NEXT:    lh a2, 330(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s8
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN32-NEXT:    sb a0, 229(sp)
-; ZVFHMIN32-NEXT:    lh a0, 584(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, s3
+; ZVFHMIN32-NEXT:    lh a1, 330(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, s6
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft1
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    feq.h a1, ft2, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s7
+; ZVFHMIN32-NEXT:    sb a1, 229(sp)
+; ZVFHMIN32-NEXT:    lh a1, 584(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, s8
 ; ZVFHMIN32-NEXT:    lh a2, 328(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN32-NEXT:    feq.h s4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT:    sb a0, 228(sp)
-; ZVFHMIN32-NEXT:    lh a0, 582(sp)
-; ZVFHMIN32-NEXT:    lh a2, 326(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT:    sb a0, 227(sp)
-; ZVFHMIN32-NEXT:    lh a0, 580(sp)
-; ZVFHMIN32-NEXT:    lh a2, 324(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s7
-; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 226(sp)
-; ZVFHMIN32-NEXT:    lh a0, 578(sp)
-; ZVFHMIN32-NEXT:    lh a2, 322(sp)
-; ZVFHMIN32-NEXT:    sb s2, 193(sp)
-; ZVFHMIN32-NEXT:    sb a1, 194(sp)
-; ZVFHMIN32-NEXT:    sb s4, 195(sp)
-; ZVFHMIN32-NEXT:    sb a4, 196(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa3, ft1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 197(sp)
-; ZVFHMIN32-NEXT:    sb t6, 198(sp)
-; ZVFHMIN32-NEXT:    sb t5, 199(sp)
-; ZVFHMIN32-NEXT:    sb a0, 225(sp)
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    sb a2, 228(sp)
+; ZVFHMIN32-NEXT:    lh a2, 582(sp)
+; ZVFHMIN32-NEXT:    lh a4, 326(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h t4, fa2, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s11
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, ra
+; ZVFHMIN32-NEXT:    sb a2, 227(sp)
+; ZVFHMIN32-NEXT:    lh a2, 580(sp)
+; ZVFHMIN32-NEXT:    lh a4, 324(sp)
+; ZVFHMIN32-NEXT:    feq.h t5, fa0, fa5
+; ZVFHMIN32-NEXT:    feq.h t6, ft0, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT:    sb a2, 226(sp)
+; ZVFHMIN32-NEXT:    lh a2, 578(sp)
+; ZVFHMIN32-NEXT:    lh a4, 322(sp)
+; ZVFHMIN32-NEXT:    sb t6, 193(sp)
+; ZVFHMIN32-NEXT:    feq.h t6, fa1, fa4
+; ZVFHMIN32-NEXT:    sb t5, 194(sp)
+; ZVFHMIN32-NEXT:    sb t6, 195(sp)
+; ZVFHMIN32-NEXT:    sb t4, 196(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 197(sp)
+; ZVFHMIN32-NEXT:    sb a3, 198(sp)
+; ZVFHMIN32-NEXT:    sb a0, 199(sp)
+; ZVFHMIN32-NEXT:    sb a2, 225(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 4
+; ZVFHMIN32-NEXT:    add a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    li a3, 11
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
@@ -1920,165 +1915,171 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 508(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v6
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 2
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v4
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v30
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 190(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 506(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    csrr a3, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a3, 3
+; ZVFHMIN32-NEXT:    slli a3, a3, 1
 ; ZVFHMIN32-NEXT:    add a3, sp, a3
 ; ZVFHMIN32-NEXT:    addi a3, a3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    csrr a4, vlenb
-; ZVFHMIN32-NEXT:    li s3, 6
-; ZVFHMIN32-NEXT:    mul a4, a4, s3
-; ZVFHMIN32-NEXT:    add a4, sp, a4
-; ZVFHMIN32-NEXT:    addi a4, a4, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 189(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 760(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 504(sp)
-; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    li s4, 12
-; ZVFHMIN32-NEXT:    mul s3, s3, s4
-; ZVFHMIN32-NEXT:    add s3, sp, s3
-; ZVFHMIN32-NEXT:    addi s3, s3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
-; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    li s4, 10
-; ZVFHMIN32-NEXT:    mul s3, s3, s4
-; ZVFHMIN32-NEXT:    add s3, sp, s3
-; ZVFHMIN32-NEXT:    addi s3, s3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s4, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    li t3, 6
+; ZVFHMIN32-NEXT:    mul a4, a4, t3
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN32-NEXT:    sb a0, 188(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 758(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 502(sp)
-; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    slli s3, s3, 4
-; ZVFHMIN32-NEXT:    add s3, sp, s3
-; ZVFHMIN32-NEXT:    addi s3, s3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v16
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    csrr t2, vlenb
+; ZVFHMIN32-NEXT:    slli t2, t2, 3
+; ZVFHMIN32-NEXT:    add t2, sp, t2
+; ZVFHMIN32-NEXT:    addi t2, t2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
 ; ZVFHMIN32-NEXT:    sb a0, 187(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 756(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 500(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, t1
+; ZVFHMIN32-NEXT:    csrr t1, vlenb
+; ZVFHMIN32-NEXT:    li t3, 13
+; ZVFHMIN32-NEXT:    mul t1, t1, t3
+; ZVFHMIN32-NEXT:    add t1, sp, t1
+; ZVFHMIN32-NEXT:    addi t1, t1, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 186(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 754(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, t0
 ; ZVFHMIN32-NEXT:    lh a1, 498(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN32-NEXT:    sb a0, 185(sp)
-; ZVFHMIN32-NEXT:    lh a0, 752(sp)
-; ZVFHMIN32-NEXT:    lh a1, 496(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    sb a0, 184(sp)
-; ZVFHMIN32-NEXT:    lh a0, 750(sp)
-; ZVFHMIN32-NEXT:    lh a1, 494(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN32-NEXT:    csrr t0, vlenb
+; ZVFHMIN32-NEXT:    li t1, 19
+; ZVFHMIN32-NEXT:    mul t0, t0, t1
+; ZVFHMIN32-NEXT:    add t0, sp, t0
+; ZVFHMIN32-NEXT:    addi t0, t0, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li t0, 21
+; ZVFHMIN32-NEXT:    mul a0, a0, t0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa1, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a2
+; ZVFHMIN32-NEXT:    sb a1, 185(sp)
+; ZVFHMIN32-NEXT:    lh a1, 752(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a3
+; ZVFHMIN32-NEXT:    lh a2, 496(sp)
+; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa1
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    feq.h t1, fa4, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    sb a1, 184(sp)
+; ZVFHMIN32-NEXT:    lh a1, 750(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    lh a2, 494(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa3, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa2, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    sb a0, 183(sp)
-; ZVFHMIN32-NEXT:    lh a0, 748(sp)
-; ZVFHMIN32-NEXT:    lh a1, 492(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    sb a0, 182(sp)
-; ZVFHMIN32-NEXT:    lh a0, 746(sp)
-; ZVFHMIN32-NEXT:    lh a1, 490(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a2, 183(sp)
+; ZVFHMIN32-NEXT:    lh a2, 748(sp)
+; ZVFHMIN32-NEXT:    lh a4, 492(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    sb a0, 181(sp)
-; ZVFHMIN32-NEXT:    lh a0, 744(sp)
-; ZVFHMIN32-NEXT:    lh a1, 488(sp)
+; ZVFHMIN32-NEXT:    sb a2, 182(sp)
+; ZVFHMIN32-NEXT:    lh a2, 746(sp)
+; ZVFHMIN32-NEXT:    lh a4, 490(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
 ; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    addi a1, sp, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    sb a2, 181(sp)
+; ZVFHMIN32-NEXT:    lh a2, 744(sp)
+; ZVFHMIN32-NEXT:    lh a4, 488(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a4, 108(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v0
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN32-NEXT:    sb a0, 180(sp)
-; ZVFHMIN32-NEXT:    lh a0, 742(sp)
-; ZVFHMIN32-NEXT:    lh a7, 486(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 179(sp)
-; ZVFHMIN32-NEXT:    lh a0, 740(sp)
-; ZVFHMIN32-NEXT:    lh a7, 484(sp)
-; ZVFHMIN32-NEXT:    sb a2, 140(sp)
-; ZVFHMIN32-NEXT:    sb t1, 141(sp)
-; ZVFHMIN32-NEXT:    sb t3, 142(sp)
-; ZVFHMIN32-NEXT:    sb t4, 143(sp)
-; ZVFHMIN32-NEXT:    sb a1, 136(sp)
-; ZVFHMIN32-NEXT:    sb a6, 137(sp)
-; ZVFHMIN32-NEXT:    sb a4, 138(sp)
-; ZVFHMIN32-NEXT:    sb a3, 139(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN32-NEXT:    sb a2, 180(sp)
+; ZVFHMIN32-NEXT:    lh a2, 742(sp)
+; ZVFHMIN32-NEXT:    lh t2, 486(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a2, 179(sp)
+; ZVFHMIN32-NEXT:    lh a2, 740(sp)
+; ZVFHMIN32-NEXT:    lh t2, 484(sp)
+; ZVFHMIN32-NEXT:    sb a1, 140(sp)
+; ZVFHMIN32-NEXT:    sb a3, 141(sp)
+; ZVFHMIN32-NEXT:    sb t1, 142(sp)
+; ZVFHMIN32-NEXT:    sb t0, 143(sp)
+; ZVFHMIN32-NEXT:    sb a5, 136(sp)
+; ZVFHMIN32-NEXT:    sb a0, 137(sp)
+; ZVFHMIN32-NEXT:    sb a6, 138(sp)
+; ZVFHMIN32-NEXT:    sb a7, 139(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 178(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 638(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 382(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -2086,7 +2087,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 636(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 380(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -2094,7 +2095,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 634(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 378(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -2102,7 +2103,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 632(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 376(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -2110,7 +2111,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 630(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 374(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -2118,102 +2119,101 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 628(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 372(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    sb a0, 250(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 626(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 370(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 249(sp)
-; ZVFHMIN32-NEXT:    lh a0, 624(sp)
-; ZVFHMIN32-NEXT:    lh a1, 368(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 248(sp)
-; ZVFHMIN32-NEXT:    lh a0, 622(sp)
-; ZVFHMIN32-NEXT:    lh a1, 366(sp)
+; ZVFHMIN32-NEXT:    sb a0, 249(sp)
+; ZVFHMIN32-NEXT:    lh a1, 624(sp)
+; ZVFHMIN32-NEXT:    lh a3, 368(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 247(sp)
-; ZVFHMIN32-NEXT:    lh a0, 620(sp)
-; ZVFHMIN32-NEXT:    lh a1, 364(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a3, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT:    sb a1, 248(sp)
+; ZVFHMIN32-NEXT:    lh a1, 622(sp)
+; ZVFHMIN32-NEXT:    lh a3, 366(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 246(sp)
-; ZVFHMIN32-NEXT:    lh a0, 618(sp)
-; ZVFHMIN32-NEXT:    lh a1, 362(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a3, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT:    sb a1, 247(sp)
+; ZVFHMIN32-NEXT:    lh a1, 620(sp)
+; ZVFHMIN32-NEXT:    lh a3, 364(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
 ; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT:    sb a0, 245(sp)
-; ZVFHMIN32-NEXT:    lh a0, 616(sp)
-; ZVFHMIN32-NEXT:    lh a1, 360(sp)
+; ZVFHMIN32-NEXT:    sb a1, 246(sp)
+; ZVFHMIN32-NEXT:    lh a1, 618(sp)
+; ZVFHMIN32-NEXT:    lh a3, 362(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    sb a0, 244(sp)
-; ZVFHMIN32-NEXT:    lh a0, 614(sp)
-; ZVFHMIN32-NEXT:    lh a1, 358(sp)
+; ZVFHMIN32-NEXT:    sb a1, 245(sp)
+; ZVFHMIN32-NEXT:    lh a1, 616(sp)
+; ZVFHMIN32-NEXT:    lh a3, 360(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
 ; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    sb a1, 244(sp)
+; ZVFHMIN32-NEXT:    lh a1, 614(sp)
+; ZVFHMIN32-NEXT:    lh a3, 358(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    sb a0, 243(sp)
-; ZVFHMIN32-NEXT:    lh a0, 612(sp)
-; ZVFHMIN32-NEXT:    lh a1, 356(sp)
-; ZVFHMIN32-NEXT:    sb a5, 204(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    sb a1, 243(sp)
+; ZVFHMIN32-NEXT:    lh a1, 612(sp)
+; ZVFHMIN32-NEXT:    lh a3, 356(sp)
+; ZVFHMIN32-NEXT:    sb t0, 204(sp)
 ; ZVFHMIN32-NEXT:    sb a4, 205(sp)
-; ZVFHMIN32-NEXT:    sb a2, 206(sp)
-; ZVFHMIN32-NEXT:    sb a3, 207(sp)
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 200(sp)
-; ZVFHMIN32-NEXT:    sb a6, 201(sp)
-; ZVFHMIN32-NEXT:    sb a7, 202(sp)
-; ZVFHMIN32-NEXT:    sb t0, 203(sp)
-; ZVFHMIN32-NEXT:    li a2, 128
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 242(sp)
-; ZVFHMIN32-NEXT:    addi a0, sp, 128
-; ZVFHMIN32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN32-NEXT:    vle8.v v8, (a0)
+; ZVFHMIN32-NEXT:    sb a0, 206(sp)
+; ZVFHMIN32-NEXT:    sb a2, 207(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 200(sp)
+; ZVFHMIN32-NEXT:    sb a5, 201(sp)
+; ZVFHMIN32-NEXT:    sb a6, 202(sp)
+; ZVFHMIN32-NEXT:    sb a7, 203(sp)
+; ZVFHMIN32-NEXT:    li a0, 128
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 242(sp)
+; ZVFHMIN32-NEXT:    addi a1, sp, 128
+; ZVFHMIN32-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN32-NEXT:    vle8.v v8, (a1)
 ; ZVFHMIN32-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN32-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN32-NEXT:    addi sp, s0, -896
@@ -2498,19 +2498,16 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 13
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v16
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v18, v8, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2518,52 +2515,51 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v16, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v19, v16, 5
 ; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 10
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 20
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a4, a2, 4
+; ZVFHMIN64-NEXT:    sub a2, a4, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 13
 ; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 11
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a4, a2, 4
+; ZVFHMIN64-NEXT:    add a2, a4, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN64-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 11
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 8
+; ZVFHMIN64-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v4, v16, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2571,12 +2567,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 302(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 5
-; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 3
-; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v31, v0, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v5, v0, 2
 ; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 1
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
@@ -2587,63 +2583,63 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 14
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 3
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 13
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 6
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 6
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 12
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 12
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a2, a2, 3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 11
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 10
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 13
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    li a4, 19
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a4, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
-; ZVFHMIN64-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v26
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 300(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v28
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v26
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v4
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
+; ZVFHMIN64-NEXT:    vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v16
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2651,229 +2647,234 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 296(sp)
 ; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v30
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v22
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 294(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v22
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v20
 ; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 211(sp)
-; ZVFHMIN64-NEXT:    lh a1, 548(sp)
-; ZVFHMIN64-NEXT:    lh t5, 292(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v14
-; ZVFHMIN64-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN64-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 210(sp)
-; ZVFHMIN64-NEXT:    lh a1, 546(sp)
-; ZVFHMIN64-NEXT:    lh t5, 290(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v24
+; ZVFHMIN64-NEXT:    lh a0, 548(sp)
+; ZVFHMIN64-NEXT:    lh a1, 292(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v14
+; ZVFHMIN64-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    sd a2, 120(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
-; ZVFHMIN64-NEXT:    sb a1, 209(sp)
-; ZVFHMIN64-NEXT:    lh a1, 544(sp)
-; ZVFHMIN64-NEXT:    lh t5, 288(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 192(sp)
-; ZVFHMIN64-NEXT:    sb a1, 208(sp)
-; ZVFHMIN64-NEXT:    lh t5, 738(sp)
-; ZVFHMIN64-NEXT:    lh t6, 482(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v12
-; ZVFHMIN64-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN64-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 177(sp)
-; ZVFHMIN64-NEXT:    lh t5, 736(sp)
-; ZVFHMIN64-NEXT:    lh t6, 480(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 29
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s5, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 28
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s6, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 176(sp)
-; ZVFHMIN64-NEXT:    lh t5, 734(sp)
-; ZVFHMIN64-NEXT:    lh t6, 478(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 27
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s7, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 26
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s8, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 175(sp)
-; ZVFHMIN64-NEXT:    lh t5, 732(sp)
-; ZVFHMIN64-NEXT:    lh t6, 476(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 25
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s4, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 24
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s3, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 174(sp)
-; ZVFHMIN64-NEXT:    lh t6, 730(sp)
-; ZVFHMIN64-NEXT:    lh s9, 474(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 23
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s2, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v3
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 210(sp)
+; ZVFHMIN64-NEXT:    lh a0, 546(sp)
+; ZVFHMIN64-NEXT:    lh a1, 290(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v24
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a0, 209(sp)
+; ZVFHMIN64-NEXT:    lh a0, 544(sp)
+; ZVFHMIN64-NEXT:    lh a1, 288(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a3, 192(sp)
+; ZVFHMIN64-NEXT:    sb a0, 208(sp)
+; ZVFHMIN64-NEXT:    lh a0, 738(sp)
+; ZVFHMIN64-NEXT:    lh a1, 482(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
+; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v12
+; ZVFHMIN64-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 177(sp)
+; ZVFHMIN64-NEXT:    lh a0, 736(sp)
+; ZVFHMIN64-NEXT:    lh a1, 480(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 29
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 28
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 176(sp)
+; ZVFHMIN64-NEXT:    lh a0, 734(sp)
+; ZVFHMIN64-NEXT:    lh a1, 478(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 27
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s6, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 26
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 175(sp)
+; ZVFHMIN64-NEXT:    lh a0, 732(sp)
+; ZVFHMIN64-NEXT:    lh a1, 476(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 25
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s7, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 174(sp)
+; ZVFHMIN64-NEXT:    lh a0, 730(sp)
+; ZVFHMIN64-NEXT:    lh a1, 474(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 23
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s8, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v21
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 173(sp)
+; ZVFHMIN64-NEXT:    lh a0, 728(sp)
+; ZVFHMIN64-NEXT:    lh a1, 472(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v3
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v19
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 172(sp)
+; ZVFHMIN64-NEXT:    lh a0, 726(sp)
+; ZVFHMIN64-NEXT:    lh a1, 470(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v11
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v7
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 171(sp)
+; ZVFHMIN64-NEXT:    lh a0, 724(sp)
+; ZVFHMIN64-NEXT:    lh s9, 468(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v29
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t6, 173(sp)
-; ZVFHMIN64-NEXT:    lh s9, 728(sp)
-; ZVFHMIN64-NEXT:    lh s10, 472(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v31
-; ZVFHMIN64-NEXT:    vmv.x.s ra, v13
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN64-NEXT:    sb s9, 172(sp)
-; ZVFHMIN64-NEXT:    lh s9, 726(sp)
-; ZVFHMIN64-NEXT:    lh s10, 470(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v29
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v11
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN64-NEXT:    sb s9, 171(sp)
-; ZVFHMIN64-NEXT:    lh s10, 724(sp)
-; ZVFHMIN64-NEXT:    lh s11, 468(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v7
-; ZVFHMIN64-NEXT:    vmv.x.s s9, v9
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s11
-; ZVFHMIN64-NEXT:    feq.h s10, fa5, fa4
-; ZVFHMIN64-NEXT:    sb s10, 170(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 170(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 466(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s10, v21
-; ZVFHMIN64-NEXT:    vmv.x.s s11, v27
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v31
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 169(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 720(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 464(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v27
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN64-NEXT:    sb a0, 168(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 718(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 462(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, s7
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s8
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, ra
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s6
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 167(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 716(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a2
 ; ZVFHMIN64-NEXT:    lh a1, 460(sp)
-; ZVFHMIN64-NEXT:    feq.h s5, fa5, fa1
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN64-NEXT:    sb a1, 166(sp)
-; ZVFHMIN64-NEXT:    lh a1, 714(sp)
-; ZVFHMIN64-NEXT:    lh a2, 458(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a3, fa3, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
-; ZVFHMIN64-NEXT:    sb a1, 165(sp)
-; ZVFHMIN64-NEXT:    lh a1, 712(sp)
-; ZVFHMIN64-NEXT:    lh a2, 456(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT:    feq.h a4, fa2, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa3, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, s2
-; ZVFHMIN64-NEXT:    sb a1, 164(sp)
-; ZVFHMIN64-NEXT:    lh a1, 710(sp)
-; ZVFHMIN64-NEXT:    lh a2, 454(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s9
-; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s3
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, s7
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa0, ft0
+; ZVFHMIN64-NEXT:    sb a0, 166(sp)
+; ZVFHMIN64-NEXT:    lh a0, 714(sp)
+; ZVFHMIN64-NEXT:    lh a1, 458(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, s4
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, s8
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, ft1, ft2
+; ZVFHMIN64-NEXT:    sb a0, 165(sp)
+; ZVFHMIN64-NEXT:    lh a0, 712(sp)
+; ZVFHMIN64-NEXT:    lh a1, 456(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, s10
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, s11
+; ZVFHMIN64-NEXT:    fmv.h.x ft3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, ft3, ft4
+; ZVFHMIN64-NEXT:    sb a0, 164(sp)
+; ZVFHMIN64-NEXT:    lh a0, 710(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft3, a4
+; ZVFHMIN64-NEXT:    lh a1, 454(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft4, ra
+; ZVFHMIN64-NEXT:    fmv.h.x ft5, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN64-NEXT:    feq.h a1, ft5, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
 ; ZVFHMIN64-NEXT:    sb a1, 163(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 708(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, a2
 ; ZVFHMIN64-NEXT:    lh a2, 452(sp)
-; ZVFHMIN64-NEXT:    feq.h s3, fa4, fa5
-; ZVFHMIN64-NEXT:    feq.h s4, fa3, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 162(sp)
-; ZVFHMIN64-NEXT:    lh a1, 706(sp)
-; ZVFHMIN64-NEXT:    lh a2, 450(sp)
-; ZVFHMIN64-NEXT:    sb s4, 129(sp)
-; ZVFHMIN64-NEXT:    sb s3, 130(sp)
-; ZVFHMIN64-NEXT:    sb s2, 131(sp)
-; ZVFHMIN64-NEXT:    sb a4, 132(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa0, fa5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a1, ft0, ft1
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa0
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    sb a2, 162(sp)
+; ZVFHMIN64-NEXT:    lh a2, 706(sp)
+; ZVFHMIN64-NEXT:    lh a4, 450(sp)
+; ZVFHMIN64-NEXT:    sb a1, 129(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa1, fa5
+; ZVFHMIN64-NEXT:    sb a3, 130(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa2, ft4
+; ZVFHMIN64-NEXT:    sb a1, 131(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa4, ft2
+; ZVFHMIN64-NEXT:    sb a3, 132(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa3, ft3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a3, 133(sp)
-; ZVFHMIN64-NEXT:    sb a0, 134(sp)
-; ZVFHMIN64-NEXT:    sb s5, 135(sp)
-; ZVFHMIN64-NEXT:    sb a1, 161(sp)
+; ZVFHMIN64-NEXT:    sb a1, 134(sp)
+; ZVFHMIN64-NEXT:    sb a0, 135(sp)
+; ZVFHMIN64-NEXT:    sb a2, 161(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 354(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s6, v5
-; ZVFHMIN64-NEXT:    vmv.x.s s5, v23
+; ZVFHMIN64-NEXT:    vmv.x.s s4, v23
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 10
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2881,13 +2882,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 352(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 20
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 4
+; ZVFHMIN64-NEXT:    sub a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
@@ -2896,153 +2896,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 350(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 239(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 604(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 348(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 238(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 602(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 346(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 237(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 600(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 344(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 236(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 598(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 342(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 235(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 596(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 340(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 234(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 594(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 338(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 233(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 592(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    lh t5, 336(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT:    lh a1, 336(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
 ; ZVFHMIN64-NEXT:    sb a0, 232(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 590(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a3
-; ZVFHMIN64-NEXT:    lh a2, 334(sp)
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    feq.h t6, fa4, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN64-NEXT:    lh a1, 334(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s4
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 231(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 588(sp)
-; ZVFHMIN64-NEXT:    lh a2, 332(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    lh a1, 332(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, s2
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, s5
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, ft0, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 230(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 586(sp)
-; ZVFHMIN64-NEXT:    lh a2, 330(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s8
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN64-NEXT:    sb a0, 229(sp)
-; ZVFHMIN64-NEXT:    lh a0, 584(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, s3
+; ZVFHMIN64-NEXT:    lh a1, 330(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, s6
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft1
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    feq.h a1, ft2, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s7
+; ZVFHMIN64-NEXT:    sb a1, 229(sp)
+; ZVFHMIN64-NEXT:    lh a1, 584(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, s8
 ; ZVFHMIN64-NEXT:    lh a2, 328(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN64-NEXT:    feq.h s4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT:    sb a0, 228(sp)
-; ZVFHMIN64-NEXT:    lh a0, 582(sp)
-; ZVFHMIN64-NEXT:    lh a2, 326(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT:    sb a0, 227(sp)
-; ZVFHMIN64-NEXT:    lh a0, 580(sp)
-; ZVFHMIN64-NEXT:    lh a2, 324(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s7
-; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 226(sp)
-; ZVFHMIN64-NEXT:    lh a0, 578(sp)
-; ZVFHMIN64-NEXT:    lh a2, 322(sp)
-; ZVFHMIN64-NEXT:    sb s2, 193(sp)
-; ZVFHMIN64-NEXT:    sb a1, 194(sp)
-; ZVFHMIN64-NEXT:    sb s4, 195(sp)
-; ZVFHMIN64-NEXT:    sb a4, 196(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa3, ft1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 197(sp)
-; ZVFHMIN64-NEXT:    sb t6, 198(sp)
-; ZVFHMIN64-NEXT:    sb t5, 199(sp)
-; ZVFHMIN64-NEXT:    sb a0, 225(sp)
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    sb a2, 228(sp)
+; ZVFHMIN64-NEXT:    lh a2, 582(sp)
+; ZVFHMIN64-NEXT:    lh a4, 326(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h t4, fa2, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s11
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, ra
+; ZVFHMIN64-NEXT:    sb a2, 227(sp)
+; ZVFHMIN64-NEXT:    lh a2, 580(sp)
+; ZVFHMIN64-NEXT:    lh a4, 324(sp)
+; ZVFHMIN64-NEXT:    feq.h t5, fa0, fa5
+; ZVFHMIN64-NEXT:    feq.h t6, ft0, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT:    sb a2, 226(sp)
+; ZVFHMIN64-NEXT:    lh a2, 578(sp)
+; ZVFHMIN64-NEXT:    lh a4, 322(sp)
+; ZVFHMIN64-NEXT:    sb t6, 193(sp)
+; ZVFHMIN64-NEXT:    feq.h t6, fa1, fa4
+; ZVFHMIN64-NEXT:    sb t5, 194(sp)
+; ZVFHMIN64-NEXT:    sb t6, 195(sp)
+; ZVFHMIN64-NEXT:    sb t4, 196(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 197(sp)
+; ZVFHMIN64-NEXT:    sb a3, 198(sp)
+; ZVFHMIN64-NEXT:    sb a0, 199(sp)
+; ZVFHMIN64-NEXT:    sb a2, 225(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 4
+; ZVFHMIN64-NEXT:    add a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    li a3, 11
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
@@ -3054,165 +3049,171 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 508(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v6
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 2
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v4
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v30
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 190(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 506(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    csrr a3, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a3, 3
+; ZVFHMIN64-NEXT:    slli a3, a3, 1
 ; ZVFHMIN64-NEXT:    add a3, sp, a3
 ; ZVFHMIN64-NEXT:    addi a3, a3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    csrr a4, vlenb
-; ZVFHMIN64-NEXT:    li s3, 6
-; ZVFHMIN64-NEXT:    mul a4, a4, s3
-; ZVFHMIN64-NEXT:    add a4, sp, a4
-; ZVFHMIN64-NEXT:    addi a4, a4, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 189(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 760(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 504(sp)
-; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    li s4, 12
-; ZVFHMIN64-NEXT:    mul s3, s3, s4
-; ZVFHMIN64-NEXT:    add s3, sp, s3
-; ZVFHMIN64-NEXT:    addi s3, s3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
-; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    li s4, 10
-; ZVFHMIN64-NEXT:    mul s3, s3, s4
-; ZVFHMIN64-NEXT:    add s3, sp, s3
-; ZVFHMIN64-NEXT:    addi s3, s3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s4, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    li t3, 6
+; ZVFHMIN64-NEXT:    mul a4, a4, t3
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN64-NEXT:    sb a0, 188(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 758(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 502(sp)
-; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    slli s3, s3, 4
-; ZVFHMIN64-NEXT:    add s3, sp, s3
-; ZVFHMIN64-NEXT:    addi s3, s3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v16
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    csrr t2, vlenb
+; ZVFHMIN64-NEXT:    slli t2, t2, 3
+; ZVFHMIN64-NEXT:    add t2, sp, t2
+; ZVFHMIN64-NEXT:    addi t2, t2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
 ; ZVFHMIN64-NEXT:    sb a0, 187(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 756(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 500(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, t1
+; ZVFHMIN64-NEXT:    csrr t1, vlenb
+; ZVFHMIN64-NEXT:    li t3, 13
+; ZVFHMIN64-NEXT:    mul t1, t1, t3
+; ZVFHMIN64-NEXT:    add t1, sp, t1
+; ZVFHMIN64-NEXT:    addi t1, t1, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 186(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 754(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, t0
 ; ZVFHMIN64-NEXT:    lh a1, 498(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN64-NEXT:    sb a0, 185(sp)
-; ZVFHMIN64-NEXT:    lh a0, 752(sp)
-; ZVFHMIN64-NEXT:    lh a1, 496(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    sb a0, 184(sp)
-; ZVFHMIN64-NEXT:    lh a0, 750(sp)
-; ZVFHMIN64-NEXT:    lh a1, 494(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN64-NEXT:    csrr t0, vlenb
+; ZVFHMIN64-NEXT:    li t1, 19
+; ZVFHMIN64-NEXT:    mul t0, t0, t1
+; ZVFHMIN64-NEXT:    add t0, sp, t0
+; ZVFHMIN64-NEXT:    addi t0, t0, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li t0, 21
+; ZVFHMIN64-NEXT:    mul a0, a0, t0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa1, fa0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a2
+; ZVFHMIN64-NEXT:    sb a1, 185(sp)
+; ZVFHMIN64-NEXT:    lh a1, 752(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a3
+; ZVFHMIN64-NEXT:    lh a2, 496(sp)
+; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa1
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    feq.h t1, fa4, fa0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    sb a1, 184(sp)
+; ZVFHMIN64-NEXT:    lh a1, 750(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    lh a2, 494(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa3, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa2, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    sb a0, 183(sp)
-; ZVFHMIN64-NEXT:    lh a0, 748(sp)
-; ZVFHMIN64-NEXT:    lh a1, 492(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    sb a0, 182(sp)
-; ZVFHMIN64-NEXT:    lh a0, 746(sp)
-; ZVFHMIN64-NEXT:    lh a1, 490(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a2, 183(sp)
+; ZVFHMIN64-NEXT:    lh a2, 748(sp)
+; ZVFHMIN64-NEXT:    lh a4, 492(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    sb a0, 181(sp)
-; ZVFHMIN64-NEXT:    lh a0, 744(sp)
-; ZVFHMIN64-NEXT:    lh a1, 488(sp)
+; ZVFHMIN64-NEXT:    sb a2, 182(sp)
+; ZVFHMIN64-NEXT:    lh a2, 746(sp)
+; ZVFHMIN64-NEXT:    lh a4, 490(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
 ; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    addi a1, sp, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    sb a2, 181(sp)
+; ZVFHMIN64-NEXT:    lh a2, 744(sp)
+; ZVFHMIN64-NEXT:    lh a4, 488(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a4, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v0
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN64-NEXT:    sb a0, 180(sp)
-; ZVFHMIN64-NEXT:    lh a0, 742(sp)
-; ZVFHMIN64-NEXT:    lh a7, 486(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 179(sp)
-; ZVFHMIN64-NEXT:    lh a0, 740(sp)
-; ZVFHMIN64-NEXT:    lh a7, 484(sp)
-; ZVFHMIN64-NEXT:    sb a2, 140(sp)
-; ZVFHMIN64-NEXT:    sb t1, 141(sp)
-; ZVFHMIN64-NEXT:    sb t3, 142(sp)
-; ZVFHMIN64-NEXT:    sb t4, 143(sp)
-; ZVFHMIN64-NEXT:    sb a1, 136(sp)
-; ZVFHMIN64-NEXT:    sb a6, 137(sp)
-; ZVFHMIN64-NEXT:    sb a4, 138(sp)
-; ZVFHMIN64-NEXT:    sb a3, 139(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN64-NEXT:    sb a2, 180(sp)
+; ZVFHMIN64-NEXT:    lh a2, 742(sp)
+; ZVFHMIN64-NEXT:    lh t2, 486(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a2, 179(sp)
+; ZVFHMIN64-NEXT:    lh a2, 740(sp)
+; ZVFHMIN64-NEXT:    lh t2, 484(sp)
+; ZVFHMIN64-NEXT:    sb a1, 140(sp)
+; ZVFHMIN64-NEXT:    sb a3, 141(sp)
+; ZVFHMIN64-NEXT:    sb t1, 142(sp)
+; ZVFHMIN64-NEXT:    sb t0, 143(sp)
+; ZVFHMIN64-NEXT:    sb a5, 136(sp)
+; ZVFHMIN64-NEXT:    sb a0, 137(sp)
+; ZVFHMIN64-NEXT:    sb a6, 138(sp)
+; ZVFHMIN64-NEXT:    sb a7, 139(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 178(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 638(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 382(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -3220,7 +3221,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 636(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 380(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -3228,7 +3229,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 634(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 378(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -3236,7 +3237,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 632(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 376(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -3244,7 +3245,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 630(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 374(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -3252,102 +3253,101 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 628(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 372(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    sb a0, 250(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 626(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 370(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 249(sp)
-; ZVFHMIN64-NEXT:    lh a0, 624(sp)
-; ZVFHMIN64-NEXT:    lh a1, 368(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 248(sp)
-; ZVFHMIN64-NEXT:    lh a0, 622(sp)
-; ZVFHMIN64-NEXT:    lh a1, 366(sp)
+; ZVFHMIN64-NEXT:    sb a0, 249(sp)
+; ZVFHMIN64-NEXT:    lh a1, 624(sp)
+; ZVFHMIN64-NEXT:    lh a3, 368(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 88(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 247(sp)
-; ZVFHMIN64-NEXT:    lh a0, 620(sp)
-; ZVFHMIN64-NEXT:    lh a1, 364(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a3, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT:    sb a1, 248(sp)
+; ZVFHMIN64-NEXT:    lh a1, 622(sp)
+; ZVFHMIN64-NEXT:    lh a3, 366(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 246(sp)
-; ZVFHMIN64-NEXT:    lh a0, 618(sp)
-; ZVFHMIN64-NEXT:    lh a1, 362(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a3, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT:    sb a1, 247(sp)
+; ZVFHMIN64-NEXT:    lh a1, 620(sp)
+; ZVFHMIN64-NEXT:    lh a3, 364(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
 ; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT:    sb a0, 245(sp)
-; ZVFHMIN64-NEXT:    lh a0, 616(sp)
-; ZVFHMIN64-NEXT:    lh a1, 360(sp)
+; ZVFHMIN64-NEXT:    sb a1, 246(sp)
+; ZVFHMIN64-NEXT:    lh a1, 618(sp)
+; ZVFHMIN64-NEXT:    lh a3, 362(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    sb a0, 244(sp)
-; ZVFHMIN64-NEXT:    lh a0, 614(sp)
-; ZVFHMIN64-NEXT:    lh a1, 358(sp)
+; ZVFHMIN64-NEXT:    sb a1, 245(sp)
+; ZVFHMIN64-NEXT:    lh a1, 616(sp)
+; ZVFHMIN64-NEXT:    lh a3, 360(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
 ; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    sb a1, 244(sp)
+; ZVFHMIN64-NEXT:    lh a1, 614(sp)
+; ZVFHMIN64-NEXT:    lh a3, 358(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    sb a0, 243(sp)
-; ZVFHMIN64-NEXT:    lh a0, 612(sp)
-; ZVFHMIN64-NEXT:    lh a1, 356(sp)
-; ZVFHMIN64-NEXT:    sb a5, 204(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    sb a1, 243(sp)
+; ZVFHMIN64-NEXT:    lh a1, 612(sp)
+; ZVFHMIN64-NEXT:    lh a3, 356(sp)
+; ZVFHMIN64-NEXT:    sb t0, 204(sp)
 ; ZVFHMIN64-NEXT:    sb a4, 205(sp)
-; ZVFHMIN64-NEXT:    sb a2, 206(sp)
-; ZVFHMIN64-NEXT:    sb a3, 207(sp)
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 200(sp)
-; ZVFHMIN64-NEXT:    sb a6, 201(sp)
-; ZVFHMIN64-NEXT:    sb a7, 202(sp)
-; ZVFHMIN64-NEXT:    sb t0, 203(sp)
-; ZVFHMIN64-NEXT:    li a2, 128
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 242(sp)
-; ZVFHMIN64-NEXT:    addi a0, sp, 128
-; ZVFHMIN64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN64-NEXT:    vle8.v v8, (a0)
+; ZVFHMIN64-NEXT:    sb a0, 206(sp)
+; ZVFHMIN64-NEXT:    sb a2, 207(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 200(sp)
+; ZVFHMIN64-NEXT:    sb a5, 201(sp)
+; ZVFHMIN64-NEXT:    sb a6, 202(sp)
+; ZVFHMIN64-NEXT:    sb a7, 203(sp)
+; ZVFHMIN64-NEXT:    li a0, 128
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 242(sp)
+; ZVFHMIN64-NEXT:    addi a1, sp, 128
+; ZVFHMIN64-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN64-NEXT:    vle8.v v8, (a1)
 ; ZVFHMIN64-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN64-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN64-NEXT:    addi sp, s0, -896
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index e70dcd16d02cd2..dd2a8240ee2533 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -507,37 +507,28 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
 define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
 ; RV32-LABEL: match_nxv16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s1, -12
-; RV32-NEXT:    .cfi_offset s2, -16
-; RV32-NEXT:    .cfi_offset s3, -20
-; RV32-NEXT:    .cfi_offset s4, -24
-; RV32-NEXT:    .cfi_offset s5, -28
-; RV32-NEXT:    .cfi_offset s6, -32
-; RV32-NEXT:    .cfi_offset s7, -36
-; RV32-NEXT:    .cfi_offset s8, -40
-; RV32-NEXT:    .cfi_offset s9, -44
-; RV32-NEXT:    .cfi_offset s10, -48
-; RV32-NEXT:    .cfi_offset s11, -52
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    .cfi_offset s3, -16
+; RV32-NEXT:    .cfi_offset s4, -20
+; RV32-NEXT:    .cfi_offset s5, -24
+; RV32-NEXT:    .cfi_offset s6, -28
+; RV32-NEXT:    .cfi_offset s7, -32
+; RV32-NEXT:    .cfi_offset s8, -36
 ; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    vslidedown.vi v12, v10, 1
 ; RV32-NEXT:    vslidedown.vi v13, v10, 2
 ; RV32-NEXT:    vslidedown.vi v14, v10, 3
@@ -593,95 +584,89 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    vmv.x.s s5, v15
 ; RV32-NEXT:    vmv.x.s s6, v16
 ; RV32-NEXT:    vmv.x.s s7, v17
-; RV32-NEXT:    vmv.x.s s8, v18
-; RV32-NEXT:    vmv.x.s s9, v19
-; RV32-NEXT:    vmv.x.s s10, v20
-; RV32-NEXT:    vmv.x.s s11, v21
-; RV32-NEXT:    vmv.x.s ra, v22
-; RV32-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vsetvli s8, zero, e8, m2, ta, ma
 ; RV32-NEXT:    vmseq.vx v12, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v23
+; RV32-NEXT:    vmv.x.s a0, v18
 ; RV32-NEXT:    vmseq.vx v13, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v24
-; RV32-NEXT:    vmseq.vx v14, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s5
-; RV32-NEXT:    vmor.mm v12, v12, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s6
-; RV32-NEXT:    vmor.mm v11, v12, v11
-; RV32-NEXT:    vmseq.vx v12, v8, s7
-; RV32-NEXT:    vmor.mm v11, v11, v14
-; RV32-NEXT:    vmseq.vx v14, v8, s8
-; RV32-NEXT:    vmor.mm v10, v11, v10
-; RV32-NEXT:    vmseq.vx v11, v8, s9
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s10
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s11
-; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, ra
-; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmv.x.s s2, v19
+; RV32-NEXT:    vmseq.vx v14, v8, s3
+; RV32-NEXT:    vmv.x.s s3, v20
+; RV32-NEXT:    vmseq.vx v15, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v21
+; RV32-NEXT:    vmseq.vx v16, v8, s5
+; RV32-NEXT:    vmv.x.s s5, v22
+; RV32-NEXT:    vmseq.vx v17, v8, s6
+; RV32-NEXT:    vmv.x.s s6, v23
+; RV32-NEXT:    vmseq.vx v18, v8, s7
+; RV32-NEXT:    vmv.x.s s7, v11
 ; RV32-NEXT:    vmseq.vx v11, v8, a0
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s2
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s3
+; RV32-NEXT:    vmv.x.s a0, v24
+; RV32-NEXT:    vmseq.vx v19, v8, s2
+; RV32-NEXT:    vmv.x.s s2, v10
+; RV32-NEXT:    vmor.mm v10, v12, v13
 ; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, s4
+; RV32-NEXT:    vmor.mm v10, v10, v15
+; RV32-NEXT:    vmor.mm v10, v10, v16
+; RV32-NEXT:    vmor.mm v10, v10, v17
+; RV32-NEXT:    vmseq.vx v12, v8, s3
+; RV32-NEXT:    vmor.mm v10, v10, v18
+; RV32-NEXT:    vmseq.vx v13, v8, s4
 ; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a1
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, a2
+; RV32-NEXT:    vmseq.vx v11, v8, s5
+; RV32-NEXT:    vmor.mm v10, v10, v19
+; RV32-NEXT:    vmseq.vx v14, v8, s6
 ; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, a3
-; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, a4
-; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a5
+; RV32-NEXT:    vmseq.vx v12, v8, s7
 ; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, a6
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, a7
-; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, t0
+; RV32-NEXT:    vmseq.vx v13, v8, a0
 ; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, t1
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t2
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, t3
+; RV32-NEXT:    vmseq.vx v11, v8, s2
 ; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, t4
-; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, t5
+; RV32-NEXT:    vmseq.vx v14, v8, a1
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, a2
 ; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t6
+; RV32-NEXT:    vmseq.vx v13, v8, a3
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a4
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, a5
 ; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s0
+; RV32-NEXT:    vmseq.vx v12, v8, a6
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, a7
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, t0
 ; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, t1
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, t2
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, t3
 ; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, t4
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, t5
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, t6
 ; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s0
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmor.mm v10, v10, v14
 ; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmor.mm v10, v10, v13
 ; RV32-NEXT:    vmseq.vx v11, v8, s1
 ; RV32-NEXT:    vmor.mm v8, v10, v11
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -691,46 +676,34 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    .cfi_restore s6
 ; RV32-NEXT:    .cfi_restore s7
 ; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    .cfi_restore s10
-; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_nxv16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -112
-; RV64-NEXT:    .cfi_def_cfa_offset 112
-; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s1, -24
-; RV64-NEXT:    .cfi_offset s2, -32
-; RV64-NEXT:    .cfi_offset s3, -40
-; RV64-NEXT:    .cfi_offset s4, -48
-; RV64-NEXT:    .cfi_offset s5, -56
-; RV64-NEXT:    .cfi_offset s6, -64
-; RV64-NEXT:    .cfi_offset s7, -72
-; RV64-NEXT:    .cfi_offset s8, -80
-; RV64-NEXT:    .cfi_offset s9, -88
-; RV64-NEXT:    .cfi_offset s10, -96
-; RV64-NEXT:    .cfi_offset s11, -104
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    .cfi_def_cfa_offset 80
+; RV64-NEXT:    sd s0, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    .cfi_offset s1, -16
+; RV64-NEXT:    .cfi_offset s2, -24
+; RV64-NEXT:    .cfi_offset s3, -32
+; RV64-NEXT:    .cfi_offset s4, -40
+; RV64-NEXT:    .cfi_offset s5, -48
+; RV64-NEXT:    .cfi_offset s6, -56
+; RV64-NEXT:    .cfi_offset s7, -64
+; RV64-NEXT:    .cfi_offset s8, -72
 ; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    vslidedown.vi v12, v10, 1
 ; RV64-NEXT:    vslidedown.vi v13, v10, 2
 ; RV64-NEXT:    vslidedown.vi v14, v10, 3
@@ -786,95 +759,89 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    vmv.x.s s5, v15
 ; RV64-NEXT:    vmv.x.s s6, v16
 ; RV64-NEXT:    vmv.x.s s7, v17
-; RV64-NEXT:    vmv.x.s s8, v18
-; RV64-NEXT:    vmv.x.s s9, v19
-; RV64-NEXT:    vmv.x.s s10, v20
-; RV64-NEXT:    vmv.x.s s11, v21
-; RV64-NEXT:    vmv.x.s ra, v22
-; RV64-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    vsetvli s8, zero, e8, m2, ta, ma
 ; RV64-NEXT:    vmseq.vx v12, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v23
+; RV64-NEXT:    vmv.x.s a0, v18
 ; RV64-NEXT:    vmseq.vx v13, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v24
-; RV64-NEXT:    vmseq.vx v14, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s5
-; RV64-NEXT:    vmor.mm v12, v12, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s6
-; RV64-NEXT:    vmor.mm v11, v12, v11
-; RV64-NEXT:    vmseq.vx v12, v8, s7
-; RV64-NEXT:    vmor.mm v11, v11, v14
-; RV64-NEXT:    vmseq.vx v14, v8, s8
-; RV64-NEXT:    vmor.mm v10, v11, v10
-; RV64-NEXT:    vmseq.vx v11, v8, s9
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s10
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s11
-; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, ra
-; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmv.x.s s2, v19
+; RV64-NEXT:    vmseq.vx v14, v8, s3
+; RV64-NEXT:    vmv.x.s s3, v20
+; RV64-NEXT:    vmseq.vx v15, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v21
+; RV64-NEXT:    vmseq.vx v16, v8, s5
+; RV64-NEXT:    vmv.x.s s5, v22
+; RV64-NEXT:    vmseq.vx v17, v8, s6
+; RV64-NEXT:    vmv.x.s s6, v23
+; RV64-NEXT:    vmseq.vx v18, v8, s7
+; RV64-NEXT:    vmv.x.s s7, v11
 ; RV64-NEXT:    vmseq.vx v11, v8, a0
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s2
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s3
+; RV64-NEXT:    vmv.x.s a0, v24
+; RV64-NEXT:    vmseq.vx v19, v8, s2
+; RV64-NEXT:    vmv.x.s s2, v10
+; RV64-NEXT:    vmor.mm v10, v12, v13
 ; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, s4
+; RV64-NEXT:    vmor.mm v10, v10, v15
+; RV64-NEXT:    vmor.mm v10, v10, v16
+; RV64-NEXT:    vmor.mm v10, v10, v17
+; RV64-NEXT:    vmseq.vx v12, v8, s3
+; RV64-NEXT:    vmor.mm v10, v10, v18
+; RV64-NEXT:    vmseq.vx v13, v8, s4
 ; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a1
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, a2
+; RV64-NEXT:    vmseq.vx v11, v8, s5
+; RV64-NEXT:    vmor.mm v10, v10, v19
+; RV64-NEXT:    vmseq.vx v14, v8, s6
 ; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, a3
-; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, a4
-; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a5
+; RV64-NEXT:    vmseq.vx v12, v8, s7
 ; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, a6
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, a7
-; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, t0
+; RV64-NEXT:    vmseq.vx v13, v8, a0
 ; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, t1
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t2
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, t3
+; RV64-NEXT:    vmseq.vx v11, v8, s2
 ; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, t4
-; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, t5
+; RV64-NEXT:    vmseq.vx v14, v8, a1
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, a2
 ; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t6
+; RV64-NEXT:    vmseq.vx v13, v8, a3
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a4
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, a5
 ; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s0
+; RV64-NEXT:    vmseq.vx v12, v8, a6
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, a7
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, t0
 ; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, t1
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, t2
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, t3
 ; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, t4
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, t5
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, t6
 ; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s0
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmor.mm v10, v10, v14
 ; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmor.mm v10, v10, v13
 ; RV64-NEXT:    vmseq.vx v11, v8, s1
 ; RV64-NEXT:    vmor.mm v8, v10, v11
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    ld s0, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore s0
 ; RV64-NEXT:    .cfi_restore s1
 ; RV64-NEXT:    .cfi_restore s2
@@ -884,10 +851,7 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    .cfi_restore s6
 ; RV64-NEXT:    .cfi_restore s7
 ; RV64-NEXT:    .cfi_restore s8
-; RV64-NEXT:    .cfi_restore s9
-; RV64-NEXT:    .cfi_restore s10
-; RV64-NEXT:    .cfi_restore s11
-; RV64-NEXT:    addi sp, sp, 112
+; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
@@ -897,34 +861,24 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
 ; RV32-LABEL: match_v16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s1, -12
-; RV32-NEXT:    .cfi_offset s2, -16
-; RV32-NEXT:    .cfi_offset s3, -20
-; RV32-NEXT:    .cfi_offset s4, -24
-; RV32-NEXT:    .cfi_offset s5, -28
-; RV32-NEXT:    .cfi_offset s6, -32
-; RV32-NEXT:    .cfi_offset s7, -36
-; RV32-NEXT:    .cfi_offset s8, -40
-; RV32-NEXT:    .cfi_offset s9, -44
-; RV32-NEXT:    .cfi_offset s10, -48
-; RV32-NEXT:    .cfi_offset s11, -52
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    .cfi_offset s3, -16
+; RV32-NEXT:    .cfi_offset s4, -20
+; RV32-NEXT:    .cfi_offset s5, -24
+; RV32-NEXT:    .cfi_offset s6, -28
+; RV32-NEXT:    .cfi_offset s7, -32
 ; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v9, v10, 1
@@ -982,93 +936,87 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    vmv.x.s s5, v14
 ; RV32-NEXT:    vmv.x.s s6, v15
 ; RV32-NEXT:    vmv.x.s s7, v16
-; RV32-NEXT:    vmv.x.s s8, v17
-; RV32-NEXT:    vmv.x.s s9, v18
-; RV32-NEXT:    vmv.x.s s10, v19
-; RV32-NEXT:    vmv.x.s s11, v20
-; RV32-NEXT:    vmv.x.s ra, v21
 ; RV32-NEXT:    vmseq.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v22
+; RV32-NEXT:    vmv.x.s a0, v17
 ; RV32-NEXT:    vmseq.vx v12, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v23
-; RV32-NEXT:    vmseq.vx v13, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s5
+; RV32-NEXT:    vmv.x.s s2, v18
+; RV32-NEXT:    vmseq.vx v13, v8, s3
+; RV32-NEXT:    vmv.x.s s3, v19
+; RV32-NEXT:    vmseq.vx v14, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v20
+; RV32-NEXT:    vmseq.vx v15, v8, s5
+; RV32-NEXT:    vmv.x.s s5, v21
+; RV32-NEXT:    vmseq.vx v16, v8, s6
+; RV32-NEXT:    vmv.x.s s6, v22
+; RV32-NEXT:    vmseq.vx v17, v8, s7
+; RV32-NEXT:    vmv.x.s s7, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v23
+; RV32-NEXT:    vmseq.vx v18, v8, s2
+; RV32-NEXT:    vmv.x.s s2, v10
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s6
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s7
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s8
-; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s9
-; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s10
+; RV32-NEXT:    vmor.mm v9, v9, v14
+; RV32-NEXT:    vmor.mm v9, v9, v15
+; RV32-NEXT:    vmor.mm v9, v9, v16
+; RV32-NEXT:    vmseq.vx v10, v8, s3
+; RV32-NEXT:    vmor.mm v9, v9, v17
+; RV32-NEXT:    vmseq.vx v12, v8, s4
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s11
-; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, ra
+; RV32-NEXT:    vmseq.vx v11, v8, s5
+; RV32-NEXT:    vmor.mm v9, v9, v18
+; RV32-NEXT:    vmseq.vx v13, v8, s6
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, a0
+; RV32-NEXT:    vmseq.vx v10, v8, s7
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s2
+; RV32-NEXT:    vmseq.vx v12, v8, a0
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s3
+; RV32-NEXT:    vmseq.vx v11, v8, s2
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s4
+; RV32-NEXT:    vmseq.vx v13, v8, a1
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, a1
+; RV32-NEXT:    vmseq.vx v10, v8, a2
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, a2
+; RV32-NEXT:    vmseq.vx v12, v8, a3
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a3
+; RV32-NEXT:    vmseq.vx v11, v8, a4
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, a4
+; RV32-NEXT:    vmseq.vx v13, v8, a5
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, a5
+; RV32-NEXT:    vmseq.vx v10, v8, a6
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, a6
+; RV32-NEXT:    vmseq.vx v12, v8, a7
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a7
+; RV32-NEXT:    vmseq.vx v11, v8, t0
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t0
+; RV32-NEXT:    vmseq.vx v13, v8, t1
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, t1
+; RV32-NEXT:    vmseq.vx v10, v8, t2
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, t2
+; RV32-NEXT:    vmseq.vx v12, v8, t3
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, t3
+; RV32-NEXT:    vmseq.vx v11, v8, t4
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, t4
+; RV32-NEXT:    vmseq.vx v13, v8, t5
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, t5
+; RV32-NEXT:    vmseq.vx v10, v8, t6
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, t6
+; RV32-NEXT:    vmseq.vx v12, v8, s0
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s0
 ; RV32-NEXT:    vmor.mm v9, v9, v13
 ; RV32-NEXT:    vmor.mm v9, v9, v10
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmor.mm v9, v9, v11
 ; RV32-NEXT:    vmseq.vx v8, v8, s1
 ; RV32-NEXT:    vmor.mm v8, v9, v8
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -1077,44 +1025,30 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    .cfi_restore s5
 ; RV32-NEXT:    .cfi_restore s6
 ; RV32-NEXT:    .cfi_restore s7
-; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    .cfi_restore s10
-; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_v16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -112
-; RV64-NEXT:    .cfi_def_cfa_offset 112
-; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s1, -24
-; RV64-NEXT:    .cfi_offset s2, -32
-; RV64-NEXT:    .cfi_offset s3, -40
-; RV64-NEXT:    .cfi_offset s4, -48
-; RV64-NEXT:    .cfi_offset s5, -56
-; RV64-NEXT:    .cfi_offset s6, -64
-; RV64-NEXT:    .cfi_offset s7, -72
-; RV64-NEXT:    .cfi_offset s8, -80
-; RV64-NEXT:    .cfi_offset s9, -88
-; RV64-NEXT:    .cfi_offset s10, -96
-; RV64-NEXT:    .cfi_offset s11, -104
+; RV64-NEXT:    addi sp, sp, -64
+; RV64-NEXT:    .cfi_def_cfa_offset 64
+; RV64-NEXT:    sd s0, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    .cfi_offset s1, -16
+; RV64-NEXT:    .cfi_offset s2, -24
+; RV64-NEXT:    .cfi_offset s3, -32
+; RV64-NEXT:    .cfi_offset s4, -40
+; RV64-NEXT:    .cfi_offset s5, -48
+; RV64-NEXT:    .cfi_offset s6, -56
+; RV64-NEXT:    .cfi_offset s7, -64
 ; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v9, v10, 1
@@ -1172,93 +1106,87 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    vmv.x.s s5, v14
 ; RV64-NEXT:    vmv.x.s s6, v15
 ; RV64-NEXT:    vmv.x.s s7, v16
-; RV64-NEXT:    vmv.x.s s8, v17
-; RV64-NEXT:    vmv.x.s s9, v18
-; RV64-NEXT:    vmv.x.s s10, v19
-; RV64-NEXT:    vmv.x.s s11, v20
-; RV64-NEXT:    vmv.x.s ra, v21
 ; RV64-NEXT:    vmseq.vx v9, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v22
+; RV64-NEXT:    vmv.x.s a0, v17
 ; RV64-NEXT:    vmseq.vx v12, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v23
-; RV64-NEXT:    vmseq.vx v13, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s5
+; RV64-NEXT:    vmv.x.s s2, v18
+; RV64-NEXT:    vmseq.vx v13, v8, s3
+; RV64-NEXT:    vmv.x.s s3, v19
+; RV64-NEXT:    vmseq.vx v14, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v20
+; RV64-NEXT:    vmseq.vx v15, v8, s5
+; RV64-NEXT:    vmv.x.s s5, v21
+; RV64-NEXT:    vmseq.vx v16, v8, s6
+; RV64-NEXT:    vmv.x.s s6, v22
+; RV64-NEXT:    vmseq.vx v17, v8, s7
+; RV64-NEXT:    vmv.x.s s7, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v23
+; RV64-NEXT:    vmseq.vx v18, v8, s2
+; RV64-NEXT:    vmv.x.s s2, v10
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s6
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s7
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s8
-; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s9
-; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s10
+; RV64-NEXT:    vmor.mm v9, v9, v14
+; RV64-NEXT:    vmor.mm v9, v9, v15
+; RV64-NEXT:    vmor.mm v9, v9, v16
+; RV64-NEXT:    vmseq.vx v10, v8, s3
+; RV64-NEXT:    vmor.mm v9, v9, v17
+; RV64-NEXT:    vmseq.vx v12, v8, s4
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s11
-; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, ra
+; RV64-NEXT:    vmseq.vx v11, v8, s5
+; RV64-NEXT:    vmor.mm v9, v9, v18
+; RV64-NEXT:    vmseq.vx v13, v8, s6
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, a0
+; RV64-NEXT:    vmseq.vx v10, v8, s7
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s2
+; RV64-NEXT:    vmseq.vx v12, v8, a0
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s3
+; RV64-NEXT:    vmseq.vx v11, v8, s2
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s4
+; RV64-NEXT:    vmseq.vx v13, v8, a1
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, a1
+; RV64-NEXT:    vmseq.vx v10, v8, a2
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, a2
+; RV64-NEXT:    vmseq.vx v12, v8, a3
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a3
+; RV64-NEXT:    vmseq.vx v11, v8, a4
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, a4
+; RV64-NEXT:    vmseq.vx v13, v8, a5
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, a5
+; RV64-NEXT:    vmseq.vx v10, v8, a6
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, a6
+; RV64-NEXT:    vmseq.vx v12, v8, a7
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a7
+; RV64-NEXT:    vmseq.vx v11, v8, t0
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t0
+; RV64-NEXT:    vmseq.vx v13, v8, t1
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, t1
+; RV64-NEXT:    vmseq.vx v10, v8, t2
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, t2
+; RV64-NEXT:    vmseq.vx v12, v8, t3
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, t3
+; RV64-NEXT:    vmseq.vx v11, v8, t4
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, t4
+; RV64-NEXT:    vmseq.vx v13, v8, t5
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, t5
+; RV64-NEXT:    vmseq.vx v10, v8, t6
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, t6
+; RV64-NEXT:    vmseq.vx v12, v8, s0
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s0
 ; RV64-NEXT:    vmor.mm v9, v9, v13
 ; RV64-NEXT:    vmor.mm v9, v9, v10
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmor.mm v9, v9, v11
 ; RV64-NEXT:    vmseq.vx v8, v8, s1
 ; RV64-NEXT:    vmor.mm v8, v9, v8
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    ld s0, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 0(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore s0
 ; RV64-NEXT:    .cfi_restore s1
 ; RV64-NEXT:    .cfi_restore s2
@@ -1267,11 +1195,7 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    .cfi_restore s5
 ; RV64-NEXT:    .cfi_restore s6
 ; RV64-NEXT:    .cfi_restore s7
-; RV64-NEXT:    .cfi_restore s8
-; RV64-NEXT:    .cfi_restore s9
-; RV64-NEXT:    .cfi_restore s10
-; RV64-NEXT:    .cfi_restore s11
-; RV64-NEXT:    addi sp, sp, 112
+; RV64-NEXT:    addi sp, sp, 64
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index c35f05be304cce..ec2448cb3965f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -489,8 +489,9 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV64-NEXT:    j .LBB0_11
 ; RV64-NEXT:  .LBB0_8: # %vector.ph
 ; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
-; RV64-NEXT:    slli t6, t0, 28
-; RV64-NEXT:    sub t6, t6, t1
+; RV64-NEXT:    slli t6, t0, 1
+; RV64-NEXT:    slli s0, t0, 28
+; RV64-NEXT:    sub t6, s0, t6
 ; RV64-NEXT:    and t6, t6, a6
 ; RV64-NEXT:    csrwi vxrm, 0
 ; RV64-NEXT:    mv s0, a2
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 437b7e557718cc..22e6f23d4d6e6a 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2203,139 +2203,136 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s1, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t3, 5(a0)
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu s0, 7(a0)
-; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s6, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu s9, 12(a0)
-; RV32I-NEXT:    lbu s10, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s7, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu ra, 18(a0)
-; RV32I-NEXT:    lbu a3, 19(a0)
-; RV32I-NEXT:    lbu t5, 20(a0)
-; RV32I-NEXT:    lbu t6, 21(a0)
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t3, t1
-; RV32I-NEXT:    or a6, s0, t4
-; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
 ; RV32I-NEXT:    lbu s0, 25(a0)
 ; RV32I-NEXT:    lbu s1, 26(a0)
 ; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s5, s5, 8
 ; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    or t2, s3, t2
-; RV32I-NEXT:    or t3, s8, s6
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s7, s4
-; RV32I-NEXT:    or s4, s11, s5
-; RV32I-NEXT:    or s5, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s10, 2(a1)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    addi t6, sp, 8
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s0, t1
-; RV32I-NEXT:    or t1, s2, s1
-; RV32I-NEXT:    or s0, s6, s3
-; RV32I-NEXT:    or s1, s9, s8
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s10
-; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s2
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or a0, a0, t4
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or a7, a7, t5
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw a7, 28(sp)
-; RV32I-NEXT:    sw t0, 32(sp)
-; RV32I-NEXT:    sw s0, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or a0, a0, t5
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s5, s3
+; RV32I-NEXT:    or a1, a1, s1
+; RV32I-NEXT:    sw a7, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    add a1, t6, a1
+; RV32I-NEXT:    add a1, s4, a1
 ; RV32I-NEXT:    andi a0, t1, 24
-; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    xori a7, a0, 31
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw a7, 16(a1)
+; RV32I-NEXT:    lw t0, 16(a1)
 ; RV32I-NEXT:    lw t2, 20(a1)
 ; RV32I-NEXT:    lw t3, 24(a1)
 ; RV32I-NEXT:    lw t4, 28(a1)
@@ -2344,33 +2341,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl a1, a3, t1
 ; RV32I-NEXT:    slli t6, a4, 1
 ; RV32I-NEXT:    srl a3, a6, t1
-; RV32I-NEXT:    slli s0, a7, 1
+; RV32I-NEXT:    slli s0, t0, 1
 ; RV32I-NEXT:    srl a4, a5, t1
 ; RV32I-NEXT:    slli s1, a6, 1
 ; RV32I-NEXT:    srl a5, t2, t1
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    srl a6, a7, t1
+; RV32I-NEXT:    srl a6, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    srl a7, t3, t1
+; RV32I-NEXT:    srl t0, t3, t1
 ; RV32I-NEXT:    slli t3, t4, 1
 ; RV32I-NEXT:    srl t1, t4, t1
-; RV32I-NEXT:    sll t4, t5, t0
-; RV32I-NEXT:    sll t5, t6, t0
-; RV32I-NEXT:    sll t6, s0, t0
-; RV32I-NEXT:    sll s0, s1, t0
-; RV32I-NEXT:    sll s1, s2, t0
-; RV32I-NEXT:    sll t2, t2, t0
-; RV32I-NEXT:    sll t3, t3, t0
+; RV32I-NEXT:    sll t4, t5, a7
+; RV32I-NEXT:    sll t5, t6, a7
+; RV32I-NEXT:    sll t6, s0, a7
+; RV32I-NEXT:    sll s0, s1, a7
+; RV32I-NEXT:    sll s1, s2, a7
+; RV32I-NEXT:    sll t2, t2, a7
+; RV32I-NEXT:    sll t3, t3, a7
 ; RV32I-NEXT:    srli s2, t1, 24
 ; RV32I-NEXT:    srli s3, t1, 16
 ; RV32I-NEXT:    srli s4, t1, 8
-; RV32I-NEXT:    or t0, a0, t4
+; RV32I-NEXT:    or a7, a0, t4
 ; RV32I-NEXT:    or t4, a1, t5
 ; RV32I-NEXT:    or t5, a3, t6
 ; RV32I-NEXT:    or s0, a4, s0
 ; RV32I-NEXT:    or s1, a5, s1
 ; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    or t3, a7, t3
+; RV32I-NEXT:    or t3, t0, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
 ; RV32I-NEXT:    sb s3, 30(a2)
@@ -2387,23 +2384,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s6, s0, 24
 ; RV32I-NEXT:    srli s7, s0, 16
 ; RV32I-NEXT:    srli s0, s0, 8
-; RV32I-NEXT:    srli s8, t5, 24
-; RV32I-NEXT:    srli s9, t5, 16
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    srli s10, t4, 24
-; RV32I-NEXT:    srli s11, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    srli t0, t5, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    srli t5, t5, 8
 ; RV32I-NEXT:    sb t6, 26(a2)
+; RV32I-NEXT:    srli t6, t4, 24
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, t0, 24
+; RV32I-NEXT:    srli t1, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    srli a6, a7, 24
 ; RV32I-NEXT:    sb t2, 17(a2)
 ; RV32I-NEXT:    sb s3, 18(a2)
 ; RV32I-NEXT:    sb s2, 19(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    srli t2, a7, 16
+; RV32I-NEXT:    srli a7, a7, 8
 ; RV32I-NEXT:    sb a5, 20(a2)
 ; RV32I-NEXT:    sb s1, 21(a2)
 ; RV32I-NEXT:    sb s5, 22(a2)
@@ -2414,30 +2411,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    sb a3, 12(a2)
 ; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb s9, 14(a2)
-; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb t3, 14(a2)
+; RV32I-NEXT:    sb t0, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb s11, 2(a2)
-; RV32I-NEXT:    sb s10, 3(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb t0, 5(a2)
-; RV32I-NEXT:    sb a6, 6(a2)
-; RV32I-NEXT:    sb a7, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb t2, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2682,129 +2678,128 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ;
 ; RV32I-LABEL: lshr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu s1, 3(a0)
-; RV32I-NEXT:    lbu s7, 4(a0)
-; RV32I-NEXT:    lbu s8, 5(a0)
-; RV32I-NEXT:    lbu s4, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s5, 8(a0)
-; RV32I-NEXT:    lbu s10, 9(a0)
-; RV32I-NEXT:    lbu s11, 10(a0)
-; RV32I-NEXT:    lbu ra, 11(a0)
-; RV32I-NEXT:    lbu t4, 12(a0)
-; RV32I-NEXT:    lbu t6, 13(a0)
-; RV32I-NEXT:    lbu a5, 14(a0)
-; RV32I-NEXT:    lbu a6, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t2, 17(a0)
-; RV32I-NEXT:    lbu t3, 18(a0)
-; RV32I-NEXT:    lbu t5, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu s0, 21(a0)
-; RV32I-NEXT:    lbu s2, 22(a0)
-; RV32I-NEXT:    lbu s3, 23(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s1, t1
-; RV32I-NEXT:    or t1, s8, s7
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s7, 25(a0)
-; RV32I-NEXT:    lbu s8, 26(a0)
-; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu s0, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    slli s11, s11, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    or s4, s6, s4
-; RV32I-NEXT:    or s5, s10, s5
-; RV32I-NEXT:    or s6, ra, s11
-; RV32I-NEXT:    lbu s10, 28(a0)
-; RV32I-NEXT:    lbu s11, 29(a0)
-; RV32I-NEXT:    lbu ra, 30(a0)
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    mv s1, sp
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    or t3, s7, s1
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or a0, a0, ra
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s4, t1
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    or a5, a5, t4
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a0, a0, s0
-; RV32I-NEXT:    add t6, t6, a1
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw a6, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
-; RV32I-NEXT:    sw a7, 8(sp)
-; RV32I-NEXT:    sw t0, 12(sp)
-; RV32I-NEXT:    sw t1, 16(sp)
-; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(t6)
-; RV32I-NEXT:    lw a5, 20(t6)
-; RV32I-NEXT:    lw a7, 24(t6)
-; RV32I-NEXT:    lw a1, 0(t6)
-; RV32I-NEXT:    lw a0, 4(t6)
-; RV32I-NEXT:    lw a4, 8(t6)
-; RV32I-NEXT:    lw a3, 12(t6)
-; RV32I-NEXT:    lw t0, 28(t6)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    add s1, s1, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -2819,21 +2814,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
-; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -2845,36 +2840,35 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    sb ra, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -2900,111 +2894,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a7, 1(a0)
-; RV64I-NEXT:    lbu t2, 2(a0)
-; RV64I-NEXT:    lbu s3, 3(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu s8, 5(a0)
-; RV64I-NEXT:    lbu s9, 6(a0)
-; RV64I-NEXT:    lbu s10, 7(a0)
-; RV64I-NEXT:    lbu s2, 8(a0)
-; RV64I-NEXT:    lbu s4, 9(a0)
-; RV64I-NEXT:    lbu s5, 10(a0)
-; RV64I-NEXT:    lbu s6, 11(a0)
-; RV64I-NEXT:    lbu s7, 12(a0)
-; RV64I-NEXT:    lbu s11, 13(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu t3, 15(a0)
-; RV64I-NEXT:    lbu a3, 16(a0)
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu t4, 18(a0)
-; RV64I-NEXT:    lbu t5, 19(a0)
-; RV64I-NEXT:    lbu a4, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s0, 22(a0)
-; RV64I-NEXT:    lbu s1, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, s3, t2
-; RV64I-NEXT:    or t0, s8, t0
-; RV64I-NEXT:    or t2, s10, s9
-; RV64I-NEXT:    lbu s3, 24(a0)
-; RV64I-NEXT:    lbu s8, 25(a0)
-; RV64I-NEXT:    lbu s9, 26(a0)
-; RV64I-NEXT:    lbu s10, 27(a0)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s11, s11, 8
-; RV64I-NEXT:    or s2, s4, s2
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    or s5, s11, s7
-; RV64I-NEXT:    lbu s6, 28(a0)
-; RV64I-NEXT:    lbu s7, 29(a0)
-; RV64I-NEXT:    lbu s11, 30(a0)
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or t1, t3, t1
-; RV64I-NEXT:    mv t3, sp
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli t5, t5, 24
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    slli s1, s1, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or s6, s11, s10
+; RV64I-NEXT:    mv s7, sp
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a4, t6, a4
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or t4, s8, s3
-; RV64I-NEXT:    or t5, s10, s9
-; RV64I-NEXT:    or t6, s7, s6
-; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
+; RV64I-NEXT:    or a0, a0, s5
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t2, t0
-; RV64I-NEXT:    or t0, s4, s2
-; RV64I-NEXT:    or t1, t1, s5
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a4, s0, a4
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a0, a0, t6
-; RV64I-NEXT:    add t3, t3, a1
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    add s7, s7, a1
 ; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a1, a7, a5
-; RV64I-NEXT:    or a5, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    sd a1, 0(sp)
-; RV64I-NEXT:    sd a5, 8(sp)
-; RV64I-NEXT:    sd a3, 16(sp)
+; RV64I-NEXT:    or a1, a6, a5
+; RV64I-NEXT:    or a4, a7, s0
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a1, 8(sp)
+; RV64I-NEXT:    sd a4, 16(sp)
 ; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    ld a4, 16(t3)
-; RV64I-NEXT:    ld a0, 8(t3)
-; RV64I-NEXT:    ld a1, 0(t3)
-; RV64I-NEXT:    ld a3, 24(t3)
+; RV64I-NEXT:    ld a4, 16(s7)
+; RV64I-NEXT:    ld a0, 8(s7)
+; RV64I-NEXT:    ld a1, 0(s7)
+; RV64I-NEXT:    ld a3, 24(s7)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -3023,25 +3017,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
-; RV64I-NEXT:    srli s8, a1, 24
-; RV64I-NEXT:    srli s9, a1, 16
-; RV64I-NEXT:    srli s10, a1, 8
-; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
+; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a5, a0, 56
 ; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    srli t1, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli a6, a0, 32
+; RV64I-NEXT:    srli t2, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -3051,19 +3045,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    srli t3, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb s10, 1(a2)
-; RV64I-NEXT:    sb s9, 2(a2)
-; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
+; RV64I-NEXT:    sb a7, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a5, 14(a2)
-; RV64I-NEXT:    sb s11, 15(a2)
+; RV64I-NEXT:    sb t2, 12(a2)
+; RV64I-NEXT:    sb t1, 13(a2)
+; RV64I-NEXT:    sb a4, 14(a2)
+; RV64I-NEXT:    sb a5, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb t3, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -3082,129 +3076,128 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; RV32I-LABEL: lshr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu s1, 3(a0)
-; RV32I-NEXT:    lbu s7, 4(a0)
-; RV32I-NEXT:    lbu s8, 5(a0)
-; RV32I-NEXT:    lbu s4, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s5, 8(a0)
-; RV32I-NEXT:    lbu s10, 9(a0)
-; RV32I-NEXT:    lbu s11, 10(a0)
-; RV32I-NEXT:    lbu ra, 11(a0)
-; RV32I-NEXT:    lbu t4, 12(a0)
-; RV32I-NEXT:    lbu t6, 13(a0)
-; RV32I-NEXT:    lbu a5, 14(a0)
-; RV32I-NEXT:    lbu a6, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t2, 17(a0)
-; RV32I-NEXT:    lbu t3, 18(a0)
-; RV32I-NEXT:    lbu t5, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu s0, 21(a0)
-; RV32I-NEXT:    lbu s2, 22(a0)
-; RV32I-NEXT:    lbu s3, 23(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s1, t1
-; RV32I-NEXT:    or t1, s8, s7
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s7, 25(a0)
-; RV32I-NEXT:    lbu s8, 26(a0)
-; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu s0, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    slli s11, s11, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    or s4, s6, s4
-; RV32I-NEXT:    or s5, s10, s5
-; RV32I-NEXT:    or s6, ra, s11
-; RV32I-NEXT:    lbu s10, 28(a0)
-; RV32I-NEXT:    lbu s11, 29(a0)
-; RV32I-NEXT:    lbu ra, 30(a0)
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    mv s1, sp
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    or t3, s7, s1
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or a0, a0, ra
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
 ; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s4, t1
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    or a5, a5, t4
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a0, a0, s0
-; RV32I-NEXT:    add t6, t6, a1
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw a6, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
-; RV32I-NEXT:    sw a7, 8(sp)
-; RV32I-NEXT:    sw t0, 12(sp)
-; RV32I-NEXT:    sw t1, 16(sp)
-; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(t6)
-; RV32I-NEXT:    lw a5, 20(t6)
-; RV32I-NEXT:    lw a7, 24(t6)
-; RV32I-NEXT:    lw a1, 0(t6)
-; RV32I-NEXT:    lw a0, 4(t6)
-; RV32I-NEXT:    lw a4, 8(t6)
-; RV32I-NEXT:    lw a3, 12(t6)
-; RV32I-NEXT:    lw t0, 28(t6)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    add s1, s1, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -3219,21 +3212,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
-; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -3245,36 +3238,35 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    sb ra, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -3518,132 +3510,129 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s1, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t3, 5(a0)
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu s0, 7(a0)
-; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s6, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu s9, 12(a0)
-; RV32I-NEXT:    lbu s10, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s7, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu ra, 18(a0)
-; RV32I-NEXT:    lbu a3, 19(a0)
-; RV32I-NEXT:    lbu t5, 20(a0)
-; RV32I-NEXT:    lbu t6, 21(a0)
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t3, t1
-; RV32I-NEXT:    or a6, s0, t4
-; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
 ; RV32I-NEXT:    lbu s0, 25(a0)
 ; RV32I-NEXT:    lbu s1, 26(a0)
 ; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s5, s5, 8
 ; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    or t2, s3, t2
-; RV32I-NEXT:    or t3, s8, s6
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s7, s4
-; RV32I-NEXT:    or s4, s11, s5
-; RV32I-NEXT:    or s5, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s10, 2(a1)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    addi t6, sp, 40
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    addi s4, sp, 32
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s0, t1
-; RV32I-NEXT:    or t1, s2, s1
-; RV32I-NEXT:    or s0, s6, s3
-; RV32I-NEXT:    or s1, s9, s8
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s10
-; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s2
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or a0, a0, t4
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or a7, a7, t5
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sw t2, 56(sp)
-; RV32I-NEXT:    sw a7, 60(sp)
-; RV32I-NEXT:    sw t0, 64(sp)
-; RV32I-NEXT:    sw s0, 68(sp)
-; RV32I-NEXT:    sw a4, 40(sp)
-; RV32I-NEXT:    sw a5, 44(sp)
-; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or a0, a0, t5
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s5, s3
+; RV32I-NEXT:    or a1, a1, s1
+; RV32I-NEXT:    sw a7, 48(sp)
 ; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
 ; RV32I-NEXT:    slli a3, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    sub a1, t6, a1
+; RV32I-NEXT:    sub a1, s4, a1
 ; RV32I-NEXT:    andi a0, a3, 24
 ; RV32I-NEXT:    xori a0, a0, 31
 ; RV32I-NEXT:    lw a4, 0(a1)
@@ -3658,10 +3647,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli t4, a4, 1
 ; RV32I-NEXT:    sll t5, a7, a3
 ; RV32I-NEXT:    srli t6, a6, 1
-; RV32I-NEXT:    sll s0, a6, a3
+; RV32I-NEXT:    sll a6, a6, a3
 ; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    sll s1, t1, a3
-; RV32I-NEXT:    srli a6, t0, 1
+; RV32I-NEXT:    sll s0, t1, a3
+; RV32I-NEXT:    srli s1, t0, 1
 ; RV32I-NEXT:    sll s2, t0, a3
 ; RV32I-NEXT:    srli a7, a7, 1
 ; RV32I-NEXT:    sll s3, a1, a3
@@ -3669,56 +3658,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll s4, t2, a3
 ; RV32I-NEXT:    srli t0, t1, 1
 ; RV32I-NEXT:    sll s5, a4, a3
-; RV32I-NEXT:    srl t2, t4, a0
-; RV32I-NEXT:    srl t4, t6, a0
-; RV32I-NEXT:    srl t6, a5, a0
-; RV32I-NEXT:    srl s6, a6, a0
-; RV32I-NEXT:    srl s7, a7, a0
-; RV32I-NEXT:    srl s8, a1, a0
-; RV32I-NEXT:    srl s9, t0, a0
-; RV32I-NEXT:    srli t1, s4, 24
-; RV32I-NEXT:    srli a7, s3, 24
+; RV32I-NEXT:    srl t4, t4, a0
+; RV32I-NEXT:    srl a4, t6, a0
+; RV32I-NEXT:    srl t1, a5, a0
+; RV32I-NEXT:    srl t6, s1, a0
+; RV32I-NEXT:    srl s1, a7, a0
+; RV32I-NEXT:    srl s6, a1, a0
+; RV32I-NEXT:    srl s7, t0, a0
+; RV32I-NEXT:    srli t2, s4, 24
+; RV32I-NEXT:    srli t0, s3, 24
 ; RV32I-NEXT:    srli a5, s2, 24
-; RV32I-NEXT:    srli a3, s1, 24
-; RV32I-NEXT:    srli a1, s0, 24
+; RV32I-NEXT:    srli a3, s0, 24
+; RV32I-NEXT:    srli a1, a6, 24
 ; RV32I-NEXT:    srli a0, t5, 24
-; RV32I-NEXT:    srli s10, s5, 24
-; RV32I-NEXT:    srli s11, s5, 16
-; RV32I-NEXT:    srli ra, s5, 8
-; RV32I-NEXT:    srli a4, t3, 24
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or t0, t5, t4
-; RV32I-NEXT:    or t2, s0, t6
-; RV32I-NEXT:    or t3, s1, s6
-; RV32I-NEXT:    or t4, s2, s7
-; RV32I-NEXT:    or t5, s3, s8
-; RV32I-NEXT:    or t6, s4, s9
+; RV32I-NEXT:    srli s8, s5, 24
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    srli t5, s5, 16
+; RV32I-NEXT:    or t1, a6, t1
+; RV32I-NEXT:    srli s9, s5, 8
+; RV32I-NEXT:    or a7, t3, t4
+; RV32I-NEXT:    srli a6, t3, 24
+; RV32I-NEXT:    or t3, s0, t6
+; RV32I-NEXT:    or t4, s2, s1
+; RV32I-NEXT:    or t6, s3, s6
+; RV32I-NEXT:    or s0, s4, s7
 ; RV32I-NEXT:    sb s5, 0(a2)
-; RV32I-NEXT:    sb ra, 1(a2)
-; RV32I-NEXT:    sb s11, 2(a2)
-; RV32I-NEXT:    sb s10, 3(a2)
-; RV32I-NEXT:    srli s0, t6, 16
-; RV32I-NEXT:    srli s1, t6, 8
-; RV32I-NEXT:    srli s2, t5, 16
-; RV32I-NEXT:    srli s3, t5, 8
+; RV32I-NEXT:    sb s9, 1(a2)
+; RV32I-NEXT:    sb t5, 2(a2)
+; RV32I-NEXT:    sb s8, 3(a2)
+; RV32I-NEXT:    srli t5, s0, 16
+; RV32I-NEXT:    srli s1, s0, 8
+; RV32I-NEXT:    srli s2, t6, 16
+; RV32I-NEXT:    srli s3, t6, 8
 ; RV32I-NEXT:    srli s4, t4, 16
 ; RV32I-NEXT:    srli s5, t4, 8
 ; RV32I-NEXT:    srli s6, t3, 16
 ; RV32I-NEXT:    srli s7, t3, 8
-; RV32I-NEXT:    srli s8, t2, 16
-; RV32I-NEXT:    srli s9, t2, 8
-; RV32I-NEXT:    srli s10, t0, 16
-; RV32I-NEXT:    srli s11, t0, 8
-; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    sb s0, 24(a2)
+; RV32I-NEXT:    srli s0, t1, 16
 ; RV32I-NEXT:    sb s1, 25(a2)
-; RV32I-NEXT:    sb s0, 26(a2)
-; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a6, 16
-; RV32I-NEXT:    sb t5, 28(a2)
+; RV32I-NEXT:    srli s1, t1, 8
+; RV32I-NEXT:    sb t5, 26(a2)
+; RV32I-NEXT:    srli t5, a4, 16
+; RV32I-NEXT:    sb t2, 27(a2)
+; RV32I-NEXT:    srli t2, a4, 8
+; RV32I-NEXT:    sb t6, 28(a2)
+; RV32I-NEXT:    srli t6, a7, 16
 ; RV32I-NEXT:    sb s3, 29(a2)
 ; RV32I-NEXT:    sb s2, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a7, a6, 8
+; RV32I-NEXT:    sb t0, 31(a2)
+; RV32I-NEXT:    srli t0, a7, 8
 ; RV32I-NEXT:    sb t4, 16(a2)
 ; RV32I-NEXT:    sb s5, 17(a2)
 ; RV32I-NEXT:    sb s4, 18(a2)
@@ -3727,32 +3716,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s7, 21(a2)
 ; RV32I-NEXT:    sb s6, 22(a2)
 ; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    sb t2, 8(a2)
-; RV32I-NEXT:    sb s9, 9(a2)
-; RV32I-NEXT:    sb s8, 10(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb s1, 9(a2)
+; RV32I-NEXT:    sb s0, 10(a2)
 ; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t5, 14(a2)
 ; RV32I-NEXT:    sb a0, 15(a2)
-; RV32I-NEXT:    sb a6, 4(a2)
-; RV32I-NEXT:    sb a7, 5(a2)
-; RV32I-NEXT:    sb t1, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a7, 4(a2)
+; RV32I-NEXT:    sb t0, 5(a2)
+; RV32I-NEXT:    sb t6, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -3997,129 +3985,128 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ;
 ; RV32I-LABEL: shl_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu s1, 3(a0)
-; RV32I-NEXT:    lbu s7, 4(a0)
-; RV32I-NEXT:    lbu s8, 5(a0)
-; RV32I-NEXT:    lbu s4, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s5, 8(a0)
-; RV32I-NEXT:    lbu s10, 9(a0)
-; RV32I-NEXT:    lbu s11, 10(a0)
-; RV32I-NEXT:    lbu ra, 11(a0)
-; RV32I-NEXT:    lbu t4, 12(a0)
-; RV32I-NEXT:    lbu t6, 13(a0)
-; RV32I-NEXT:    lbu a5, 14(a0)
-; RV32I-NEXT:    lbu a6, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t2, 17(a0)
-; RV32I-NEXT:    lbu t3, 18(a0)
-; RV32I-NEXT:    lbu t5, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu s0, 21(a0)
-; RV32I-NEXT:    lbu s2, 22(a0)
-; RV32I-NEXT:    lbu s3, 23(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s1, t1
-; RV32I-NEXT:    or t1, s8, s7
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s7, 25(a0)
-; RV32I-NEXT:    lbu s8, 26(a0)
-; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu s0, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    slli s11, s11, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    or s4, s6, s4
-; RV32I-NEXT:    or s5, s10, s5
-; RV32I-NEXT:    or s6, ra, s11
-; RV32I-NEXT:    lbu s10, 28(a0)
-; RV32I-NEXT:    lbu s11, 29(a0)
-; RV32I-NEXT:    lbu ra, 30(a0)
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 40
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    addi s1, sp, 32
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    or t3, s7, s1
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or a0, a0, ra
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s4, t1
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    or a5, a5, t4
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a0, a0, s0
-; RV32I-NEXT:    sub t2, t6, a1
-; RV32I-NEXT:    sw a3, 56(sp)
-; RV32I-NEXT:    sw a4, 60(sp)
-; RV32I-NEXT:    sw a6, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
-; RV32I-NEXT:    sw a7, 40(sp)
-; RV32I-NEXT:    sw t0, 44(sp)
-; RV32I-NEXT:    sw t1, 48(sp)
-; RV32I-NEXT:    sw a5, 52(sp)
-; RV32I-NEXT:    lw a6, 16(t2)
-; RV32I-NEXT:    lw a5, 20(t2)
-; RV32I-NEXT:    lw a7, 24(t2)
-; RV32I-NEXT:    lw a1, 0(t2)
-; RV32I-NEXT:    lw a0, 4(t2)
-; RV32I-NEXT:    lw a4, 8(t2)
-; RV32I-NEXT:    lw a3, 12(t2)
-; RV32I-NEXT:    lw t0, 28(t2)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    sub s1, s1, a1
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -4134,21 +4121,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
-; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -4160,36 +4147,35 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    sb ra, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -4215,111 +4201,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a7, 1(a0)
-; RV64I-NEXT:    lbu t2, 2(a0)
-; RV64I-NEXT:    lbu s3, 3(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu s8, 5(a0)
-; RV64I-NEXT:    lbu s9, 6(a0)
-; RV64I-NEXT:    lbu s10, 7(a0)
-; RV64I-NEXT:    lbu s2, 8(a0)
-; RV64I-NEXT:    lbu s4, 9(a0)
-; RV64I-NEXT:    lbu s5, 10(a0)
-; RV64I-NEXT:    lbu s6, 11(a0)
-; RV64I-NEXT:    lbu s7, 12(a0)
-; RV64I-NEXT:    lbu s11, 13(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu t3, 15(a0)
-; RV64I-NEXT:    lbu a3, 16(a0)
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu t4, 18(a0)
-; RV64I-NEXT:    lbu t5, 19(a0)
-; RV64I-NEXT:    lbu a4, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s0, 22(a0)
-; RV64I-NEXT:    lbu s1, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, s3, t2
-; RV64I-NEXT:    or t0, s8, t0
-; RV64I-NEXT:    or t2, s10, s9
-; RV64I-NEXT:    lbu s3, 24(a0)
-; RV64I-NEXT:    lbu s8, 25(a0)
-; RV64I-NEXT:    lbu s9, 26(a0)
-; RV64I-NEXT:    lbu s10, 27(a0)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s11, s11, 8
-; RV64I-NEXT:    or s2, s4, s2
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    or s5, s11, s7
-; RV64I-NEXT:    lbu s6, 28(a0)
-; RV64I-NEXT:    lbu s7, 29(a0)
-; RV64I-NEXT:    lbu s11, 30(a0)
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or t1, t3, t1
-; RV64I-NEXT:    addi t3, sp, 32
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli t5, t5, 24
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    slli s1, s1, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or s6, s11, s10
+; RV64I-NEXT:    addi s7, sp, 32
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a4, t6, a4
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or t4, s8, s3
-; RV64I-NEXT:    or t5, s10, s9
-; RV64I-NEXT:    or t6, s7, s6
-; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
+; RV64I-NEXT:    or a0, a0, s5
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t2, t0
-; RV64I-NEXT:    or t0, s4, s2
-; RV64I-NEXT:    or t1, t1, s5
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a4, s0, a4
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a0, a0, t6
-; RV64I-NEXT:    sub t2, t3, a1
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    sub t1, s7, a1
 ; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a1, a7, a5
-; RV64I-NEXT:    or a5, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    sd a1, 32(sp)
-; RV64I-NEXT:    sd a5, 40(sp)
-; RV64I-NEXT:    sd a3, 48(sp)
+; RV64I-NEXT:    or a1, a6, a5
+; RV64I-NEXT:    or a4, a7, s0
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    sd a1, 40(sp)
+; RV64I-NEXT:    sd a4, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    ld a4, 16(t2)
-; RV64I-NEXT:    ld a0, 8(t2)
-; RV64I-NEXT:    ld a1, 0(t2)
-; RV64I-NEXT:    ld a3, 24(t2)
+; RV64I-NEXT:    ld a4, 16(t1)
+; RV64I-NEXT:    ld a0, 8(t1)
+; RV64I-NEXT:    ld a1, 0(t1)
+; RV64I-NEXT:    ld a3, 24(t1)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -4338,25 +4324,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
-; RV64I-NEXT:    srli s8, a1, 24
-; RV64I-NEXT:    srli s9, a1, 16
-; RV64I-NEXT:    srli s10, a1, 8
-; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
+; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a5, a0, 56
 ; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    srli t1, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli a6, a0, 32
+; RV64I-NEXT:    srli t2, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -4366,19 +4352,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    srli t3, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb s10, 1(a2)
-; RV64I-NEXT:    sb s9, 2(a2)
-; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
+; RV64I-NEXT:    sb a7, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a5, 14(a2)
-; RV64I-NEXT:    sb s11, 15(a2)
+; RV64I-NEXT:    sb t2, 12(a2)
+; RV64I-NEXT:    sb t1, 13(a2)
+; RV64I-NEXT:    sb a4, 14(a2)
+; RV64I-NEXT:    sb a5, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb t3, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -4397,129 +4383,128 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ;
 ; RV32I-LABEL: shl_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu s1, 3(a0)
-; RV32I-NEXT:    lbu s7, 4(a0)
-; RV32I-NEXT:    lbu s8, 5(a0)
-; RV32I-NEXT:    lbu s4, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s5, 8(a0)
-; RV32I-NEXT:    lbu s10, 9(a0)
-; RV32I-NEXT:    lbu s11, 10(a0)
-; RV32I-NEXT:    lbu ra, 11(a0)
-; RV32I-NEXT:    lbu t4, 12(a0)
-; RV32I-NEXT:    lbu t6, 13(a0)
-; RV32I-NEXT:    lbu a5, 14(a0)
-; RV32I-NEXT:    lbu a6, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t2, 17(a0)
-; RV32I-NEXT:    lbu t3, 18(a0)
-; RV32I-NEXT:    lbu t5, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu s0, 21(a0)
-; RV32I-NEXT:    lbu s2, 22(a0)
-; RV32I-NEXT:    lbu s3, 23(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s1, t1
-; RV32I-NEXT:    or t1, s8, s7
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s7, 25(a0)
-; RV32I-NEXT:    lbu s8, 26(a0)
-; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu s0, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    slli s11, s11, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    or s4, s6, s4
-; RV32I-NEXT:    or s5, s10, s5
-; RV32I-NEXT:    or s6, ra, s11
-; RV32I-NEXT:    lbu s10, 28(a0)
-; RV32I-NEXT:    lbu s11, 29(a0)
-; RV32I-NEXT:    lbu ra, 30(a0)
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 40
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    addi s1, sp, 32
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    or t3, s7, s1
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or a0, a0, ra
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
 ; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s4, t1
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    or a5, a5, t4
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or a6, t5, t3
-; RV32I-NEXT:    or a0, a0, s0
-; RV32I-NEXT:    sub t2, t6, a1
-; RV32I-NEXT:    sw a3, 56(sp)
-; RV32I-NEXT:    sw a4, 60(sp)
-; RV32I-NEXT:    sw a6, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
-; RV32I-NEXT:    sw a7, 40(sp)
-; RV32I-NEXT:    sw t0, 44(sp)
-; RV32I-NEXT:    sw t1, 48(sp)
-; RV32I-NEXT:    sw a5, 52(sp)
-; RV32I-NEXT:    lw a6, 16(t2)
-; RV32I-NEXT:    lw a5, 20(t2)
-; RV32I-NEXT:    lw a7, 24(t2)
-; RV32I-NEXT:    lw a1, 0(t2)
-; RV32I-NEXT:    lw a0, 4(t2)
-; RV32I-NEXT:    lw a4, 8(t2)
-; RV32I-NEXT:    lw a3, 12(t2)
-; RV32I-NEXT:    lw t0, 28(t2)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    sub s1, s1, a1
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -4534,21 +4519,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
-; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -4560,36 +4545,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    sb ra, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -4834,140 +4818,137 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t6, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t3, 5(a0)
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu t5, 7(a0)
-; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu s1, 9(a0)
-; RV32I-NEXT:    lbu s7, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu s9, 12(a0)
-; RV32I-NEXT:    lbu s10, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s6, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu ra, 18(a0)
-; RV32I-NEXT:    lbu a3, 19(a0)
-; RV32I-NEXT:    lbu s2, 20(a0)
-; RV32I-NEXT:    lbu s3, 21(a0)
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    or a4, a4, t6
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t3, t1
-; RV32I-NEXT:    or a6, t5, t4
-; RV32I-NEXT:    lbu t1, 24(a0)
-; RV32I-NEXT:    lbu t5, 25(a0)
-; RV32I-NEXT:    lbu t6, 26(a0)
-; RV32I-NEXT:    lbu s0, 27(a0)
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
 ; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    or t2, s1, t2
-; RV32I-NEXT:    or t3, s8, s7
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    lbu s1, 28(a0)
-; RV32I-NEXT:    lbu s7, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s6, s4
-; RV32I-NEXT:    or s4, s11, s5
-; RV32I-NEXT:    or s5, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s6, 1(a1)
-; RV32I-NEXT:    lbu s10, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    addi s3, sp, 8
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, t5, t1
-; RV32I-NEXT:    or t1, s0, t6
-; RV32I-NEXT:    or t5, s7, s1
-; RV32I-NEXT:    or t6, s9, s8
-; RV32I-NEXT:    or a3, s6, a3
-; RV32I-NEXT:    or a1, a1, s10
-; RV32I-NEXT:    srai s0, s9, 31
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or a0, a0, t4
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or a7, a7, s2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    or t1, t6, t5
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sw s0, 56(sp)
-; RV32I-NEXT:    sw s0, 60(sp)
-; RV32I-NEXT:    sw s0, 64(sp)
-; RV32I-NEXT:    sw s0, 68(sp)
-; RV32I-NEXT:    sw s0, 40(sp)
-; RV32I-NEXT:    sw s0, 44(sp)
-; RV32I-NEXT:    sw s0, 48(sp)
-; RV32I-NEXT:    sw s0, 52(sp)
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw a7, 28(sp)
-; RV32I-NEXT:    sw t0, 32(sp)
-; RV32I-NEXT:    sw t1, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    or s5, a0, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, s6, t5
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a1, a1, s1
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    add a1, s3, a1
+; RV32I-NEXT:    add a1, s4, a1
 ; RV32I-NEXT:    andi a0, t1, 24
-; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    xori a7, a0, 31
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw a7, 16(a1)
+; RV32I-NEXT:    lw t0, 16(a1)
 ; RV32I-NEXT:    lw t2, 20(a1)
 ; RV32I-NEXT:    lw t3, 24(a1)
 ; RV32I-NEXT:    lw t4, 28(a1)
@@ -4976,33 +4957,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl a1, a3, t1
 ; RV32I-NEXT:    slli t6, a4, 1
 ; RV32I-NEXT:    srl a3, a6, t1
-; RV32I-NEXT:    slli s0, a7, 1
+; RV32I-NEXT:    slli s0, t0, 1
 ; RV32I-NEXT:    srl a4, a5, t1
 ; RV32I-NEXT:    slli s1, a6, 1
 ; RV32I-NEXT:    srl a5, t2, t1
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    srl a6, a7, t1
+; RV32I-NEXT:    srl a6, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    srl a7, t3, t1
+; RV32I-NEXT:    srl t0, t3, t1
 ; RV32I-NEXT:    slli t3, t4, 1
 ; RV32I-NEXT:    sra t1, t4, t1
-; RV32I-NEXT:    sll t4, t5, t0
-; RV32I-NEXT:    sll t5, t6, t0
-; RV32I-NEXT:    sll t6, s0, t0
-; RV32I-NEXT:    sll s0, s1, t0
-; RV32I-NEXT:    sll s1, s2, t0
-; RV32I-NEXT:    sll t2, t2, t0
-; RV32I-NEXT:    sll t3, t3, t0
+; RV32I-NEXT:    sll t4, t5, a7
+; RV32I-NEXT:    sll t5, t6, a7
+; RV32I-NEXT:    sll t6, s0, a7
+; RV32I-NEXT:    sll s0, s1, a7
+; RV32I-NEXT:    sll s1, s2, a7
+; RV32I-NEXT:    sll t2, t2, a7
+; RV32I-NEXT:    sll t3, t3, a7
 ; RV32I-NEXT:    srli s2, t1, 24
 ; RV32I-NEXT:    srli s3, t1, 16
 ; RV32I-NEXT:    srli s4, t1, 8
-; RV32I-NEXT:    or t0, a0, t4
+; RV32I-NEXT:    or a7, a0, t4
 ; RV32I-NEXT:    or t4, a1, t5
 ; RV32I-NEXT:    or t5, a3, t6
 ; RV32I-NEXT:    or s0, a4, s0
 ; RV32I-NEXT:    or s1, a5, s1
 ; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    or t3, a7, t3
+; RV32I-NEXT:    or t3, t0, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
 ; RV32I-NEXT:    sb s3, 30(a2)
@@ -5019,23 +5000,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s6, s0, 24
 ; RV32I-NEXT:    srli s7, s0, 16
 ; RV32I-NEXT:    srli s0, s0, 8
-; RV32I-NEXT:    srli s8, t5, 24
-; RV32I-NEXT:    srli s9, t5, 16
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    srli s10, t4, 24
-; RV32I-NEXT:    srli s11, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    srli t0, t5, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    srli t5, t5, 8
 ; RV32I-NEXT:    sb t6, 26(a2)
+; RV32I-NEXT:    srli t6, t4, 24
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, t0, 24
+; RV32I-NEXT:    srli t1, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    srli a6, a7, 24
 ; RV32I-NEXT:    sb t2, 17(a2)
 ; RV32I-NEXT:    sb s3, 18(a2)
 ; RV32I-NEXT:    sb s2, 19(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    srli t2, a7, 16
+; RV32I-NEXT:    srli a7, a7, 8
 ; RV32I-NEXT:    sb a5, 20(a2)
 ; RV32I-NEXT:    sb s1, 21(a2)
 ; RV32I-NEXT:    sb s5, 22(a2)
@@ -5046,30 +5027,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    sb a3, 12(a2)
 ; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb s9, 14(a2)
-; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb t3, 14(a2)
+; RV32I-NEXT:    sb t0, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb s11, 2(a2)
-; RV32I-NEXT:    sb s10, 3(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb t0, 5(a2)
-; RV32I-NEXT:    sb a6, 6(a2)
-; RV32I-NEXT:    sb a7, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb t2, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -5315,130 +5295,129 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ;
 ; RV32I-LABEL: ashr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t6, 3(a0)
-; RV32I-NEXT:    lbu s7, 4(a0)
-; RV32I-NEXT:    lbu s8, 5(a0)
-; RV32I-NEXT:    lbu s3, 6(a0)
-; RV32I-NEXT:    lbu s5, 7(a0)
-; RV32I-NEXT:    lbu s4, 8(a0)
-; RV32I-NEXT:    lbu s9, 9(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s6, 13(a0)
-; RV32I-NEXT:    lbu a5, 14(a0)
-; RV32I-NEXT:    lbu a7, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t2, 17(a0)
-; RV32I-NEXT:    lbu t3, 18(a0)
-; RV32I-NEXT:    lbu t4, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t5, 21(a0)
-; RV32I-NEXT:    lbu s0, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    or t0, t6, t1
-; RV32I-NEXT:    or t1, s8, s7
-; RV32I-NEXT:    lbu t6, 24(a0)
-; RV32I-NEXT:    lbu s7, 25(a0)
-; RV32I-NEXT:    lbu s8, 26(a0)
-; RV32I-NEXT:    lbu ra, 27(a0)
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu t5, 25(a0)
+; RV32I-NEXT:    lbu t6, 26(a0)
+; RV32I-NEXT:    lbu s0, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or s3, s5, s3
-; RV32I-NEXT:    or s4, s9, s4
-; RV32I-NEXT:    or s5, s11, s10
-; RV32I-NEXT:    lbu s9, 28(a0)
-; RV32I-NEXT:    lbu s10, 29(a0)
-; RV32I-NEXT:    lbu s11, 30(a0)
+; RV32I-NEXT:    or t4, s5, s4
+; RV32I-NEXT:    or s1, s7, s6
+; RV32I-NEXT:    or s2, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    or s2, s6, s2
-; RV32I-NEXT:    addi s6, sp, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    mv s7, sp
 ; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    slli s11, s11, 16
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or a4, t5, a4
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or t2, s7, t6
-; RV32I-NEXT:    or t3, ra, s8
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    or t5, a0, s11
+; RV32I-NEXT:    or t3, t5, t3
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or t6, s4, s3
+; RV32I-NEXT:    or s0, a0, s5
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    or t0, s3, t1
-; RV32I-NEXT:    or t1, s5, s4
-; RV32I-NEXT:    or a5, a5, s2
-; RV32I-NEXT:    or a3, a7, a3
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or a7, t3, t2
-; RV32I-NEXT:    or t2, t5, t4
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, s1, t4
+; RV32I-NEXT:    or t0, s6, s2
+; RV32I-NEXT:    or t1, t5, t3
+; RV32I-NEXT:    or t2, s0, t6
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    add s6, s6, a1
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw a7, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a6, 8(sp)
-; RV32I-NEXT:    sw t0, 12(sp)
-; RV32I-NEXT:    sw t1, 16(sp)
-; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(s6)
-; RV32I-NEXT:    lw a5, 20(s6)
-; RV32I-NEXT:    lw a7, 24(s6)
-; RV32I-NEXT:    lw a1, 0(s6)
-; RV32I-NEXT:    lw a0, 4(s6)
-; RV32I-NEXT:    lw a4, 8(s6)
-; RV32I-NEXT:    lw a3, 12(s6)
-; RV32I-NEXT:    lw t0, 28(s6)
+; RV32I-NEXT:    add s7, s7, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    lw a6, 16(s7)
+; RV32I-NEXT:    lw a5, 20(s7)
+; RV32I-NEXT:    lw a7, 24(s7)
+; RV32I-NEXT:    lw a1, 0(s7)
+; RV32I-NEXT:    lw a0, 4(s7)
+; RV32I-NEXT:    lw a4, 8(s7)
+; RV32I-NEXT:    lw a3, 12(s7)
+; RV32I-NEXT:    lw t0, 28(s7)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -5453,21 +5432,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
-; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -5479,36 +5458,35 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    sb ra, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -5534,112 +5512,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a7, 1(a0)
-; RV64I-NEXT:    lbu t1, 2(a0)
-; RV64I-NEXT:    lbu s3, 3(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu s8, 5(a0)
-; RV64I-NEXT:    lbu s9, 6(a0)
-; RV64I-NEXT:    lbu s10, 7(a0)
-; RV64I-NEXT:    lbu s2, 8(a0)
-; RV64I-NEXT:    lbu s4, 9(a0)
-; RV64I-NEXT:    lbu s5, 10(a0)
-; RV64I-NEXT:    lbu s6, 11(a0)
-; RV64I-NEXT:    lbu s7, 12(a0)
-; RV64I-NEXT:    lbu s11, 13(a0)
-; RV64I-NEXT:    lbu t4, 14(a0)
-; RV64I-NEXT:    lbu t5, 15(a0)
-; RV64I-NEXT:    lbu a3, 16(a0)
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu t2, 18(a0)
-; RV64I-NEXT:    lbu t3, 19(a0)
-; RV64I-NEXT:    lbu a4, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s0, 22(a0)
-; RV64I-NEXT:    lbu s1, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, s3, t1
-; RV64I-NEXT:    or t0, s8, t0
-; RV64I-NEXT:    or t1, s10, s9
-; RV64I-NEXT:    lbu s3, 24(a0)
-; RV64I-NEXT:    lbu s8, 25(a0)
-; RV64I-NEXT:    lbu s9, 26(a0)
-; RV64I-NEXT:    lbu s10, 27(a0)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s11, s11, 8
-; RV64I-NEXT:    or s2, s4, s2
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    or s5, s11, s7
-; RV64I-NEXT:    lbu s6, 28(a0)
-; RV64I-NEXT:    lbu s7, 29(a0)
-; RV64I-NEXT:    lbu s11, 30(a0)
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli t5, t5, 24
-; RV64I-NEXT:    or t4, t5, t4
-; RV64I-NEXT:    mv t5, sp
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    slli s1, s1, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or s6, s11, s10
+; RV64I-NEXT:    mv s7, sp
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a6, t3, t2
-; RV64I-NEXT:    or a4, t6, a4
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or t2, s8, s3
-; RV64I-NEXT:    or t3, s10, s9
-; RV64I-NEXT:    or t6, s7, s6
-; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
+; RV64I-NEXT:    or a0, a0, s5
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    or t0, s4, s2
-; RV64I-NEXT:    or t1, t4, s5
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a4, s0, a4
-; RV64I-NEXT:    or a6, t3, t2
-; RV64I-NEXT:    or a0, a0, t6
-; RV64I-NEXT:    add t5, t5, a1
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    add s7, s7, a1
 ; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    sraiw a0, a0, 31
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, a7, s0
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    sd a0, 32(sp)
 ; RV64I-NEXT:    sd a0, 40(sp)
 ; RV64I-NEXT:    sd a0, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a5, 0(sp)
-; RV64I-NEXT:    sd a7, 8(sp)
-; RV64I-NEXT:    sd a3, 16(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
 ; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a4, 16(t5)
-; RV64I-NEXT:    ld a0, 8(t5)
-; RV64I-NEXT:    ld a1, 0(t5)
-; RV64I-NEXT:    ld a3, 24(t5)
+; RV64I-NEXT:    ld a4, 16(s7)
+; RV64I-NEXT:    ld a0, 8(s7)
+; RV64I-NEXT:    ld a1, 0(s7)
+; RV64I-NEXT:    ld a3, 24(s7)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -5658,25 +5636,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
-; RV64I-NEXT:    srli s8, a1, 24
-; RV64I-NEXT:    srli s9, a1, 16
-; RV64I-NEXT:    srli s10, a1, 8
-; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
+; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a5, a0, 56
 ; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    srli t1, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli a6, a0, 32
+; RV64I-NEXT:    srli t2, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -5686,19 +5664,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    srli t3, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb s10, 1(a2)
-; RV64I-NEXT:    sb s9, 2(a2)
-; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
+; RV64I-NEXT:    sb a7, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a5, 14(a2)
-; RV64I-NEXT:    sb s11, 15(a2)
+; RV64I-NEXT:    sb t2, 12(a2)
+; RV64I-NEXT:    sb t1, 13(a2)
+; RV64I-NEXT:    sb a4, 14(a2)
+; RV64I-NEXT:    sb a5, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb t3, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -5717,130 +5695,129 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; RV32I-LABEL: ashr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t6, 3(a0)
-; RV32I-NEXT:    lbu s7, 4(a0)
-; RV32I-NEXT:    lbu s8, 5(a0)
-; RV32I-NEXT:    lbu s3, 6(a0)
-; RV32I-NEXT:    lbu s5, 7(a0)
-; RV32I-NEXT:    lbu s4, 8(a0)
-; RV32I-NEXT:    lbu s9, 9(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s6, 13(a0)
-; RV32I-NEXT:    lbu a5, 14(a0)
-; RV32I-NEXT:    lbu a7, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t2, 17(a0)
-; RV32I-NEXT:    lbu t3, 18(a0)
-; RV32I-NEXT:    lbu t4, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t5, 21(a0)
-; RV32I-NEXT:    lbu s0, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    or t0, t6, t1
-; RV32I-NEXT:    or t1, s8, s7
-; RV32I-NEXT:    lbu t6, 24(a0)
-; RV32I-NEXT:    lbu s7, 25(a0)
-; RV32I-NEXT:    lbu s8, 26(a0)
-; RV32I-NEXT:    lbu ra, 27(a0)
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu t5, 25(a0)
+; RV32I-NEXT:    lbu t6, 26(a0)
+; RV32I-NEXT:    lbu s0, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or s3, s5, s3
-; RV32I-NEXT:    or s4, s9, s4
-; RV32I-NEXT:    or s5, s11, s10
-; RV32I-NEXT:    lbu s9, 28(a0)
-; RV32I-NEXT:    lbu s10, 29(a0)
-; RV32I-NEXT:    lbu s11, 30(a0)
+; RV32I-NEXT:    or t4, s5, s4
+; RV32I-NEXT:    or s1, s7, s6
+; RV32I-NEXT:    or s2, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    or s2, s6, s2
-; RV32I-NEXT:    addi s6, sp, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    mv s7, sp
 ; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    slli s11, s11, 16
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or a4, t5, a4
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or t2, s7, t6
-; RV32I-NEXT:    or t3, ra, s8
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    or t5, a0, s11
+; RV32I-NEXT:    or t3, t5, t3
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or t6, s4, s3
+; RV32I-NEXT:    or s0, a0, s5
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    or t0, s3, t1
-; RV32I-NEXT:    or t1, s5, s4
-; RV32I-NEXT:    or a5, a5, s2
-; RV32I-NEXT:    or a3, a7, a3
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or a7, t3, t2
-; RV32I-NEXT:    or t2, t5, t4
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, s1, t4
+; RV32I-NEXT:    or t0, s6, s2
+; RV32I-NEXT:    or t1, t5, t3
+; RV32I-NEXT:    or t2, s0, t6
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    add s6, s6, a1
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw a7, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a6, 8(sp)
-; RV32I-NEXT:    sw t0, 12(sp)
-; RV32I-NEXT:    sw t1, 16(sp)
-; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(s6)
-; RV32I-NEXT:    lw a5, 20(s6)
-; RV32I-NEXT:    lw a7, 24(s6)
-; RV32I-NEXT:    lw a1, 0(s6)
-; RV32I-NEXT:    lw a0, 4(s6)
-; RV32I-NEXT:    lw a4, 8(s6)
-; RV32I-NEXT:    lw a3, 12(s6)
-; RV32I-NEXT:    lw t0, 28(s6)
+; RV32I-NEXT:    add s7, s7, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    lw a6, 16(s7)
+; RV32I-NEXT:    lw a5, 20(s7)
+; RV32I-NEXT:    lw a7, 24(s7)
+; RV32I-NEXT:    lw a1, 0(s7)
+; RV32I-NEXT:    lw a0, 4(s7)
+; RV32I-NEXT:    lw a4, 8(s7)
+; RV32I-NEXT:    lw a3, 12(s7)
+; RV32I-NEXT:    lw t0, 28(s7)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -5855,21 +5832,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
-; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -5881,36 +5858,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    sb ra, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index b2c130c2d7c10a..b8952d2cb2b29e 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -1530,25 +1530,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu a7, 3(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -1557,107 +1556,105 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t5, 10(a0)
 ; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    lbu a3, 23(a0)
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
 ; RV32I-NEXT:    or a7, t4, t3
 ; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or t3, s7, s6
-; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
 ; RV32I-NEXT:    lbu s4, 29(a0)
 ; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or s2, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s1, s3, s1
-; RV32I-NEXT:    addi s3, sp, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
 ; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s4, t6
-; RV32I-NEXT:    or t6, s6, s5
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s8
-; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, a0, t3
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, t4, s1
-; RV32I-NEXT:    or t3, t6, t5
-; RV32I-NEXT:    or a0, a1, a3
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw t2, 32(sp)
-; RV32I-NEXT:    sw t3, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, a0, t5
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a0, a1, s1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s3, a4
+; RV32I-NEXT:    add a4, s4, a4
 ; RV32I-NEXT:    lw a3, 0(a4)
 ; RV32I-NEXT:    lw a5, 4(a4)
 ; RV32I-NEXT:    lw a6, 8(a4)
@@ -1717,13 +1714,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
-; RV32I-NEXT:    srli s8, a1, 24
-; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb t5, 17(a2)
 ; RV32I-NEXT:    sb t4, 18(a2)
@@ -1744,27 +1741,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 14(a2)
 ; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    sb s9, 2(a2)
-; RV32I-NEXT:    sb s8, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t2, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2006,25 +2002,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu a7, 3(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -2033,107 +2028,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t5, 10(a0)
 ; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    lbu a3, 23(a0)
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
 ; RV32I-NEXT:    or a7, t4, t3
 ; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or t3, s7, s6
-; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
 ; RV32I-NEXT:    lbu s4, 29(a0)
 ; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or s2, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s1, s3, s1
-; RV32I-NEXT:    addi s3, sp, 40
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    addi s4, sp, 32
 ; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s4, t6
-; RV32I-NEXT:    or t6, s6, s5
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s8
-; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, a0, t3
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, t4, s1
-; RV32I-NEXT:    or t3, t6, t5
-; RV32I-NEXT:    or a0, a1, a3
-; RV32I-NEXT:    sw t0, 56(sp)
-; RV32I-NEXT:    sw t1, 60(sp)
-; RV32I-NEXT:    sw t2, 64(sp)
-; RV32I-NEXT:    sw t3, 68(sp)
-; RV32I-NEXT:    sw a4, 40(sp)
-; RV32I-NEXT:    sw a5, 44(sp)
-; RV32I-NEXT:    sw a6, 48(sp)
-; RV32I-NEXT:    sw a7, 52(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, a0, t5
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a0, a1, s1
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw t2, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    sub a3, s3, a4
+; RV32I-NEXT:    sub a3, s4, a4
 ; RV32I-NEXT:    lw a4, 0(a3)
 ; RV32I-NEXT:    lw a5, 4(a3)
 ; RV32I-NEXT:    lw a6, 8(a3)
@@ -2193,13 +2186,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
-; RV32I-NEXT:    srli s8, a1, 24
-; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 28(a2)
 ; RV32I-NEXT:    sb t5, 29(a2)
 ; RV32I-NEXT:    sb t4, 30(a2)
@@ -2220,27 +2213,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 10(a2)
 ; RV32I-NEXT:    sb s5, 11(a2)
 ; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    sb s9, 14(a2)
-; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb t0, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb a7, 15(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2483,25 +2475,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu a7, 3(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -2518,100 +2509,98 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a4, a7, a6
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
 ; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu a3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
 ; RV32I-NEXT:    or t1, s1, s0
 ; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    lbu t6, 28(a0)
-; RV32I-NEXT:    lbu s0, 29(a0)
-; RV32I-NEXT:    lbu s1, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
 ; RV32I-NEXT:    slli s6, s6, 16
 ; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or s2, s7, s6
-; RV32I-NEXT:    or s3, s9, s8
-; RV32I-NEXT:    or s4, s11, s10
-; RV32I-NEXT:    lbu s5, 0(a1)
-; RV32I-NEXT:    lbu s6, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, ra
-; RV32I-NEXT:    addi s8, sp, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s0, s0, 8
 ; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
+; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s2, s2, 8
 ; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or s1, a0, s1
-; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or s5, a0, s5
+; RV32I-NEXT:    or s1, s2, s1
 ; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, a0
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, s2, t3
-; RV32I-NEXT:    or t1, s4, s3
-; RV32I-NEXT:    or a3, t4, a3
-; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    or a0, a1, t6
-; RV32I-NEXT:    sw s0, 56(sp)
-; RV32I-NEXT:    sw s0, 60(sp)
-; RV32I-NEXT:    sw s0, 64(sp)
-; RV32I-NEXT:    sw s0, 68(sp)
-; RV32I-NEXT:    sw s0, 40(sp)
-; RV32I-NEXT:    sw s0, 44(sp)
-; RV32I-NEXT:    sw s0, 48(sp)
-; RV32I-NEXT:    sw s0, 52(sp)
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    srai s2, a0, 31
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, s6, t5
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a0, a1, s1
+; RV32I-NEXT:    sw s2, 48(sp)
+; RV32I-NEXT:    sw s2, 52(sp)
+; RV32I-NEXT:    sw s2, 56(sp)
+; RV32I-NEXT:    sw s2, 60(sp)
+; RV32I-NEXT:    sw s2, 32(sp)
+; RV32I-NEXT:    sw s2, 36(sp)
+; RV32I-NEXT:    sw s2, 40(sp)
+; RV32I-NEXT:    sw s2, 44(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s8, a4
+; RV32I-NEXT:    add a4, s4, a4
 ; RV32I-NEXT:    lw a3, 0(a4)
 ; RV32I-NEXT:    lw a5, 4(a4)
 ; RV32I-NEXT:    lw a6, 8(a4)
@@ -2671,13 +2660,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
-; RV32I-NEXT:    srli s8, a1, 24
-; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb t5, 17(a2)
 ; RV32I-NEXT:    sb t4, 18(a2)
@@ -2698,27 +2687,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 14(a2)
 ; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    sb s9, 2(a2)
-; RV32I-NEXT:    sb s8, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t2, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1

>From e96f7f7898790da1fe9cdc5cd3be7e3ae8eb8705 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Tue, 3 Dec 2024 21:44:29 +0800
Subject: [PATCH 2/3] Test commit: add a parameter to keep reserved registers in the limit
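
The new `RemoveReserved` flag lets `RegisterClassInfo::computePSetLimit` keep
requesting the raw TableGen limit while every other caller gets a limit with
the reserved registers subtracted. Below is a minimal standalone sketch of the
intended `GPRAll` computation; it is not the LLVM API itself, and the set of
registers reserved in `main()` is only an illustrative assumption (a typical
RISC-V configuration reserves x0/sp/gp/tp plus x8 when a frame pointer is
needed).

```cpp
// Standalone sketch, not LLVM code: models the reserved-adjusted limit.
#include <bitset>
#include <cassert>
#include <initializer_list>

unsigned getGPRAllLimit(const std::bitset<32> &ReservedGPRs,
                        bool RemoveReserved = true) {
  const unsigned RawLimit = 32; // all scalar GPRs x0..x31
  if (!RemoveReserved)
    return RawLimit; // computePSetLimit wants the unadjusted value
  // Everyone else sees only the registers that are actually allocatable.
  return RawLimit - static_cast<unsigned>(ReservedGPRs.count());
}

int main() {
  std::bitset<32> Reserved;
  // Hypothetical reserved set: x0 (zero), x2 (sp), x3 (gp), x4 (tp), x8 (fp).
  for (unsigned Reg : {0u, 2u, 3u, 4u, 8u})
    Reserved.set(Reg);
  assert(getGPRAllLimit(Reserved) == 27);
  assert(getGPRAllLimit(Reserved, /*RemoveReserved=*/false) == 32);
  return 0;
}
```

In the patch itself only `computePSetLimit` passes `RemoveReserved=false`; the
default of `true` keeps the adjusted limit for the scheduler, MachineLICM and
other register-pressure users.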

---
 .../include/llvm/CodeGen/TargetRegisterInfo.h |    4 +-
 llvm/lib/CodeGen/RegisterClassInfo.cpp        |    3 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |    3 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |    4 +-
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |    8 +-
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h     |    4 +-
 llvm/test/CodeGen/RISCV/pr69586.ll            |  844 +++--
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |   78 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    | 2104 +++++------
 .../RISCV/rvv/intrinsic-vector-match.ll       |  472 +--
 ...lar-shift-by-byte-multiple-legalization.ll | 3238 +++++++++--------
 .../RISCV/wide-scalar-shift-legalization.ll   |  646 ++--
 llvm/unittests/CodeGen/MFCommon.inc           |    4 +-
 llvm/utils/TableGen/RegisterInfoEmitter.cpp   |    5 +-
 14 files changed, 3813 insertions(+), 3604 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 292fa3c94969be..eaed26e33c4eb5 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -914,8 +914,10 @@ class TargetRegisterInfo : public MCRegisterInfo {
 
   /// Get the register unit pressure limit for this dimension.
   /// This limit must be adjusted dynamically for reserved registers.
+  /// If RemoveReserved is true, reserved registers are excluded from the limit.
   virtual unsigned getRegPressureSetLimit(const MachineFunction &MF,
-                                          unsigned Idx) const = 0;
+                                          unsigned Idx,
+                                          bool RemoveReserved = true) const = 0;
 
   /// Get the dimensions of register pressure impacted by this register class.
   /// Returns a -1 terminated array of pressure set IDs.
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 9312bc03bc522a..0a33915ed1e40b 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -222,7 +222,8 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
   assert(RC && "Failed to find register class");
   compute(RC);
   unsigned NAllocatableRegs = getNumAllocatableRegs(RC);
-  unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx);
+  unsigned RegPressureSetLimit =
+      TRI->getRegPressureSetLimit(*MF, Idx, /*RemoveReserved=*/false);
   // If all the regs are reserved, return raw RegPressureSetLimit.
   // One example is VRSAVERC in PowerPC.
   // Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 049f4af4dd2f93..9883454ed78298 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3640,7 +3640,8 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
 }
 
 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
-                                                unsigned Idx) const {
+                                                unsigned Idx,
+                                                bool RemoveReserved) const {
   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 8e481e3ac23043..b55f5f2c418b09 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -331,8 +331,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const override;
 
-  unsigned getRegPressureSetLimit(const MachineFunction &MF,
-                                  unsigned Idx) const override;
+  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
+                                  bool RemoveReserved = true) const override;
 
   const int *getRegUnitPressureSets(unsigned RegUnit) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index a73bd1621a739d..d5a769b6c78c7c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -936,8 +936,12 @@ bool RISCVRegisterInfo::getRegAllocationHints(
 }
 
 unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
-                                                   unsigned Idx) const {
+                                                   unsigned Idx,
+                                                   bool RemoveReserved) const {
   if (Idx == RISCV::RegisterPressureSets::GPRAll) {
+    if (!RemoveReserved)
+      return 32;
+
     unsigned Reserved = 0;
     BitVector ReservedRegs = getReservedRegs(MF);
     for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
@@ -946,5 +950,5 @@ unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
 
     return 32 - Reserved;
   }
-  return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx);
+  return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx, RemoveReserved);
 }
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index ca4934de2f52d2..58f97394ec559b 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,8 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
   static bool isRVVRegClass(const TargetRegisterClass *RC) {
     return RISCVRI::isVRegClass(RC->TSFlags);
   }
-  unsigned getRegPressureSetLimit(const MachineFunction &MF,
-                                  unsigned Idx) const override;
+  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
+                                  bool RemoveReserved = true) const override;
 };
 } // namespace llvm
 
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 21e64ada7061aa..8e6a7add781c93 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -39,384 +39,388 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    slli a2, a2, 1
 ; NOREMAT-NEXT:    sub sp, sp, a2
 ; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
-; NOREMAT-NEXT:    mv a7, a0
-; NOREMAT-NEXT:    li a0, 32
-; NOREMAT-NEXT:    addi a5, a7, 512
-; NOREMAT-NEXT:    addi a4, a7, 1024
-; NOREMAT-NEXT:    addi a6, a7, 1536
-; NOREMAT-NEXT:    li t1, 1
+; NOREMAT-NEXT:    li a7, 32
+; NOREMAT-NEXT:    addi a6, a0, 512
+; NOREMAT-NEXT:    addi a4, a0, 1024
+; NOREMAT-NEXT:    addi a5, a0, 1536
+; NOREMAT-NEXT:    li t0, 1
 ; NOREMAT-NEXT:    li a3, 5
-; NOREMAT-NEXT:    li t0, 3
+; NOREMAT-NEXT:    li t1, 3
 ; NOREMAT-NEXT:    li a2, 7
 ; NOREMAT-NEXT:    lui t2, 1
-; NOREMAT-NEXT:    li s5, 9
-; NOREMAT-NEXT:    li s8, 11
-; NOREMAT-NEXT:    lui s1, 2
-; NOREMAT-NEXT:    lui t5, 3
-; NOREMAT-NEXT:    lui s11, 4
-; NOREMAT-NEXT:    lui ra, 5
-; NOREMAT-NEXT:    lui t3, 6
-; NOREMAT-NEXT:    lui s0, 7
-; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT:    slli t4, t1, 11
-; NOREMAT-NEXT:    slli t6, a3, 9
-; NOREMAT-NEXT:    slli s2, t0, 10
-; NOREMAT-NEXT:    slli s4, a2, 9
-; NOREMAT-NEXT:    add a0, a7, t2
-; NOREMAT-NEXT:    vle32.v v8, (a5)
-; NOREMAT-NEXT:    slli s5, s5, 9
+; NOREMAT-NEXT:    li s4, 9
+; NOREMAT-NEXT:    li s6, 11
+; NOREMAT-NEXT:    li s9, 13
+; NOREMAT-NEXT:    lui s7, 2
+; NOREMAT-NEXT:    lui s1, 3
+; NOREMAT-NEXT:    lui ra, 4
+; NOREMAT-NEXT:    lui t3, 5
+; NOREMAT-NEXT:    lui s0, 6
+; NOREMAT-NEXT:    lui s3, 7
+; NOREMAT-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
+; NOREMAT-NEXT:    slli t0, t0, 11
+; NOREMAT-NEXT:    sd t0, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli t5, a3, 9
+; NOREMAT-NEXT:    slli t6, t1, 10
+; NOREMAT-NEXT:    slli s2, a2, 9
+; NOREMAT-NEXT:    add a7, a0, t2
+; NOREMAT-NEXT:    lui s11, 1
+; NOREMAT-NEXT:    slli s4, s4, 9
+; NOREMAT-NEXT:    slli s5, a3, 10
+; NOREMAT-NEXT:    vle32.v v8, (a6)
+; NOREMAT-NEXT:    slli s6, s6, 9
+; NOREMAT-NEXT:    slli s8, t1, 11
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    slli s6, a3, 10
-; NOREMAT-NEXT:    vle32.v v0, (a6)
-; NOREMAT-NEXT:    vle32.v v12, (a6)
-; NOREMAT-NEXT:    slli s8, s8, 9
-; NOREMAT-NEXT:    slli s9, t0, 11
-; NOREMAT-NEXT:    vle32.v v4, (a0)
-; NOREMAT-NEXT:    vle32.v v20, (a0)
-; NOREMAT-NEXT:    add a4, a7, s1
+; NOREMAT-NEXT:    slli s9, s9, 9
+; NOREMAT-NEXT:    vle32.v v0, (a5)
+; NOREMAT-NEXT:    vle32.v v12, (a5)
+; NOREMAT-NEXT:    slli s10, a2, 10
+; NOREMAT-NEXT:    vle32.v v4, (a7)
+; NOREMAT-NEXT:    vle32.v v20, (a7)
+; NOREMAT-NEXT:    add a4, a0, s7
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    add a4, a7, t5
+; NOREMAT-NEXT:    add a4, a0, s1
 ; NOREMAT-NEXT:    vle32.v v28, (a4)
 ; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    add a4, a7, s11
+; NOREMAT-NEXT:    add a4, a0, ra
 ; NOREMAT-NEXT:    vle32.v v24, (a4)
 ; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    add a4, a7, ra
-; NOREMAT-NEXT:    vle32.v v14, (a7)
+; NOREMAT-NEXT:    add a4, a0, t3
+; NOREMAT-NEXT:    vle32.v v14, (a0)
 ; NOREMAT-NEXT:    vle32.v v18, (a4)
 ; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    add a4, a7, t3
+; NOREMAT-NEXT:    add a4, a0, s0
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v8
 ; NOREMAT-NEXT:    vle32.v v14, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    addi a0, sp, 640
-; NOREMAT-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    add a4, a7, t4
+; NOREMAT-NEXT:    addi a4, sp, 640
+; NOREMAT-NEXT:    vs2r.v v8, (a4) # Unknown-size Folded Spill
+; NOREMAT-NEXT:    add a4, a0, t0
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, t6
+; NOREMAT-NEXT:    add a4, a0, t5
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v10
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, s2
+; NOREMAT-NEXT:    add a4, a0, t6
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, s4
+; NOREMAT-NEXT:    add a4, a0, s2
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s0
+; NOREMAT-NEXT:    add a4, a0, s3
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v8
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, s5
+; NOREMAT-NEXT:    add a4, a0, s4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s6
+; NOREMAT-NEXT:    add a4, a0, s5
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v8
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, s8
+; NOREMAT-NEXT:    add a4, a0, s6
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s9
+; NOREMAT-NEXT:    add a4, a0, s8
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    li t5, 13
-; NOREMAT-NEXT:    slli a4, t5, 9
-; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    add a4, a0, s9
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    slli a4, a2, 10
-; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    add a4, a0, s10
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    li a6, 15
-; NOREMAT-NEXT:    slli a4, a6, 9
-; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    li t2, 15
+; NOREMAT-NEXT:    slli a4, t2, 9
+; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a0, a4
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT:    lui t1, 8
-; NOREMAT-NEXT:    add a5, a7, t1
+; NOREMAT-NEXT:    lui t4, 8
+; NOREMAT-NEXT:    add a5, a0, t4
 ; NOREMAT-NEXT:    vle32.v v20, (a5)
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v2
 ; NOREMAT-NEXT:    li a4, 17
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li t2, 17
-; NOREMAT-NEXT:    sd a4, 600(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    li s1, 17
+; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a0, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
 ; NOREMAT-NEXT:    li a5, 9
 ; NOREMAT-NEXT:    slli a4, a5, 10
-; NOREMAT-NEXT:    sd a4, 592(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a0, a4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
 ; NOREMAT-NEXT:    li a4, 19
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li s1, 19
-; NOREMAT-NEXT:    sd a4, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    li t1, 19
+; NOREMAT-NEXT:    sd a4, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a0, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
 ; NOREMAT-NEXT:    slli a3, a3, 11
-; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a3, 592(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li s7, 21
 ; NOREMAT-NEXT:    slli a3, s7, 9
-; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    sd a3, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    li a4, 11
-; NOREMAT-NEXT:    slli a3, a4, 10
-; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    li a6, 11
+; NOREMAT-NEXT:    slli a3, a6, 10
+; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
 ; NOREMAT-NEXT:    li s3, 23
-; NOREMAT-NEXT:    slli s10, s3, 9
-; NOREMAT-NEXT:    add a3, a7, s10
+; NOREMAT-NEXT:    slli a3, s3, 9
+; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
 ; NOREMAT-NEXT:    li s0, 25
 ; NOREMAT-NEXT:    slli a3, s0, 9
-; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT:    slli a3, t5, 10
-; NOREMAT-NEXT:    sd a3, 544(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    li a7, 13
+; NOREMAT-NEXT:    slli a3, a7, 10
+; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
 ; NOREMAT-NEXT:    li t3, 27
 ; NOREMAT-NEXT:    slli a3, t3, 9
-; NOREMAT-NEXT:    sd a3, 536(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    sd a3, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v28, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    slli a2, a2, 11
-; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a2, 536(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li t0, 29
 ; NOREMAT-NEXT:    slli a2, t0, 9
-; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT:    slli a2, a6, 10
-; NOREMAT-NEXT:    sd a2, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    slli a2, t2, 10
+; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li t2, 15
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
 ; NOREMAT-NEXT:    li a3, 31
-; NOREMAT-NEXT:    slli a0, a3, 9
-; NOREMAT-NEXT:    sd a0, 504(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a7, a0
-; NOREMAT-NEXT:    vle32.v v12, (a0)
-; NOREMAT-NEXT:    vle32.v v4, (a0)
+; NOREMAT-NEXT:    slli a2, a3, 9
+; NOREMAT-NEXT:    sd a2, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT:    addiw a2, s11, 512
+; NOREMAT-NEXT:    addiw a2, ra, 512
 ; NOREMAT-NEXT:    sd a2, 496(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
-; NOREMAT-NEXT:    slli a2, t2, 10
+; NOREMAT-NEXT:    slli a2, s1, 10
 ; NOREMAT-NEXT:    sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    addiw a2, s11, 1536
+; NOREMAT-NEXT:    addiw a2, ra, 1536
 ; NOREMAT-NEXT:    sd a2, 480(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    slli a2, a5, 11
 ; NOREMAT-NEXT:    sd a2, 472(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v24
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v8
-; NOREMAT-NEXT:    addiw a2, ra, -1536
+; NOREMAT-NEXT:    lui a4, 5
+; NOREMAT-NEXT:    addiw a2, a4, -1536
 ; NOREMAT-NEXT:    sd a2, 464(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
-; NOREMAT-NEXT:    slli a2, s1, 10
+; NOREMAT-NEXT:    slli a2, t1, 10
 ; NOREMAT-NEXT:    sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    li t1, 19
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT:    addiw a2, ra, -512
+; NOREMAT-NEXT:    addiw a2, a4, -512
 ; NOREMAT-NEXT:    sd a2, 448(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v24
-; NOREMAT-NEXT:    addiw a2, ra, 512
+; NOREMAT-NEXT:    addiw a2, a4, 512
 ; NOREMAT-NEXT:    sd a2, 440(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    slli a2, s7, 10
 ; NOREMAT-NEXT:    sd a2, 432(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
-; NOREMAT-NEXT:    addiw a2, ra, 1536
+; NOREMAT-NEXT:    addiw a2, a4, 1536
 ; NOREMAT-NEXT:    sd a2, 424(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
-; NOREMAT-NEXT:    slli a2, a4, 11
+; NOREMAT-NEXT:    slli a2, a6, 11
 ; NOREMAT-NEXT:    sd a2, 416(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v12
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
-; NOREMAT-NEXT:    lui a4, 6
-; NOREMAT-NEXT:    addiw a2, a4, -1536
+; NOREMAT-NEXT:    lui a5, 6
+; NOREMAT-NEXT:    addiw a2, a5, -1536
 ; NOREMAT-NEXT:    sd a2, 408(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    slli a2, s3, 10
 ; NOREMAT-NEXT:    sd a2, 400(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT:    addiw a2, a4, -512
+; NOREMAT-NEXT:    addiw a2, a5, -512
 ; NOREMAT-NEXT:    sd a2, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT:    addiw a2, a4, 512
+; NOREMAT-NEXT:    addiw a2, a5, 512
 ; NOREMAT-NEXT:    sd a2, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    slli a2, s0, 10
 ; NOREMAT-NEXT:    sd a2, 376(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v18
-; NOREMAT-NEXT:    addiw a2, a4, 1536
+; NOREMAT-NEXT:    addiw a2, a5, 1536
 ; NOREMAT-NEXT:    sd a2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, t5, 11
+; NOREMAT-NEXT:    slli a2, a7, 11
 ; NOREMAT-NEXT:    sd a2, 360(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v8
-; NOREMAT-NEXT:    lui a5, 7
-; NOREMAT-NEXT:    addiw a2, a5, -1536
+; NOREMAT-NEXT:    lui a7, 7
+; NOREMAT-NEXT:    addiw a2, a7, -1536
 ; NOREMAT-NEXT:    sd a2, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    slli a2, t3, 10
 ; NOREMAT-NEXT:    sd a2, 344(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v14
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v14, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addi a0, sp, 640
-; NOREMAT-NEXT:    vl2r.v v12, (a0) # Unknown-size Folded Reload
+; NOREMAT-NEXT:    addi a2, sp, 640
+; NOREMAT-NEXT:    vl2r.v v12, (a2) # Unknown-size Folded Reload
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v22
-; NOREMAT-NEXT:    addiw a2, a5, -512
+; NOREMAT-NEXT:    addiw a2, a7, -512
 ; NOREMAT-NEXT:    sd a2, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
-; NOREMAT-NEXT:    addiw a2, a5, 512
+; NOREMAT-NEXT:    addiw a2, a7, 512
 ; NOREMAT-NEXT:    sd a2, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    slli a2, t0, 10
 ; NOREMAT-NEXT:    sd a2, 320(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v18
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v16
-; NOREMAT-NEXT:    addiw a2, a5, 1536
+; NOREMAT-NEXT:    addiw a2, a7, 1536
 ; NOREMAT-NEXT:    sd a2, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, a6, 11
+; NOREMAT-NEXT:    slli a2, t2, 11
 ; NOREMAT-NEXT:    sd a2, 304(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v14
-; NOREMAT-NEXT:    addiw a2, t1, -1536
+; NOREMAT-NEXT:    addiw a2, t4, -1536
 ; NOREMAT-NEXT:    sd a2, 296(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v14, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    slli a2, a3, 10
 ; NOREMAT-NEXT:    sd a2, 288(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v22
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addiw a0, t1, -512
-; NOREMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a7, a0
+; NOREMAT-NEXT:    addiw a2, t4, -512
+; NOREMAT-NEXT:    sd a2, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a0, a2
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
 ; NOREMAT-NEXT:    vle32.v v12, (a0)
 ; NOREMAT-NEXT:    vle32.v v0, (a0)
@@ -431,33 +435,32 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    addi a0, a1, 1024
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    lui a0, 1
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add s11, a1, s11
+; NOREMAT-NEXT:    sd s11, 272(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 2
 ; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 3
 ; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add s11, a1, s11
-; NOREMAT-NEXT:    sd s11, 248(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a1, a4
-; NOREMAT-NEXT:    sd a4, 232(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a4, 240(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a5, a1, a5
-; NOREMAT-NEXT:    sd a5, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a1, t1
+; NOREMAT-NEXT:    sd a5, 232(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a7, a1, a7
+; NOREMAT-NEXT:    sd a7, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a1, t4
 ; NOREMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t1, 512
+; NOREMAT-NEXT:    addiw a0, t4, 512
 ; NOREMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t1, 1024
+; NOREMAT-NEXT:    addiw a0, t4, 1024
 ; NOREMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t1, 1536
+; NOREMAT-NEXT:    addiw a0, t4, 1536
 ; NOREMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli t2, t2, 11
-; NOREMAT-NEXT:    sd t2, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli s1, s1, 11
+; NOREMAT-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 9
 ; NOREMAT-NEXT:    addiw a2, a0, -1536
 ; NOREMAT-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
@@ -470,7 +473,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    addiw s11, a0, 512
 ; NOREMAT-NEXT:    addiw s7, a0, 1024
 ; NOREMAT-NEXT:    addiw s3, a0, 1536
-; NOREMAT-NEXT:    slli s1, s1, 11
+; NOREMAT-NEXT:    slli s1, t1, 11
 ; NOREMAT-NEXT:    lui a0, 10
 ; NOREMAT-NEXT:    addiw t2, a0, -1536
 ; NOREMAT-NEXT:    addiw a7, a0, -1024
@@ -478,52 +481,52 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    add a2, a1, a0
 ; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    addiw a0, a0, 512
-; NOREMAT-NEXT:    add a2, a1, t4
-; NOREMAT-NEXT:    add a3, a1, t6
-; NOREMAT-NEXT:    add a5, a1, s2
-; NOREMAT-NEXT:    add a6, a1, s4
-; NOREMAT-NEXT:    add t0, a1, s5
-; NOREMAT-NEXT:    add t1, a1, s6
-; NOREMAT-NEXT:    add t3, a1, s8
-; NOREMAT-NEXT:    add t4, a1, s9
-; NOREMAT-NEXT:    ld t5, 624(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add t5, a1, t5
-; NOREMAT-NEXT:    ld t6, 616(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add t6, a1, t6
-; NOREMAT-NEXT:    ld s0, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a2, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a2, a1, a2
+; NOREMAT-NEXT:    add a3, a1, t5
+; NOREMAT-NEXT:    add a5, a1, t6
+; NOREMAT-NEXT:    add a6, a1, s2
+; NOREMAT-NEXT:    add t0, a1, s4
+; NOREMAT-NEXT:    add t1, a1, s5
+; NOREMAT-NEXT:    add t3, a1, s6
+; NOREMAT-NEXT:    add t4, a1, s8
+; NOREMAT-NEXT:    add t5, a1, s9
+; NOREMAT-NEXT:    add t6, a1, s10
+; NOREMAT-NEXT:    ld s0, 624(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s0, a1, s0
-; NOREMAT-NEXT:    ld s2, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s2, 616(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s2, a1, s2
-; NOREMAT-NEXT:    ld s4, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s4, 608(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s4, a1, s4
-; NOREMAT-NEXT:    ld s5, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s5, 600(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s5, a1, s5
-; NOREMAT-NEXT:    ld s6, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s6, 592(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s6, a1, s6
-; NOREMAT-NEXT:    ld s8, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s8, 584(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s8, a1, s8
-; NOREMAT-NEXT:    ld s9, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s9, 576(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s9, a1, s9
+; NOREMAT-NEXT:    ld s10, 568(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s10, a1, s10
-; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 560(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 16(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 512(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 80(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 496(sp) # 8-byte Folded Reload
@@ -917,9 +920,10 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_offset s10, -96
 ; REMAT-NEXT:    .cfi_offset s11, -104
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    li a3, 14
+; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    sub sp, sp, a2
-; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb
+; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb
 ; REMAT-NEXT:    li a4, 32
 ; REMAT-NEXT:    addi a5, a0, 512
 ; REMAT-NEXT:    addi a3, a0, 1024
@@ -956,13 +960,20 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli s6, s6, 9
 ; REMAT-NEXT:    li s7, 5
 ; REMAT-NEXT:    slli s7, s7, 11
+; REMAT-NEXT:    li s8, 21
+; REMAT-NEXT:    slli s8, s8, 9
+; REMAT-NEXT:    li s9, 11
+; REMAT-NEXT:    slli s9, s9, 10
+; REMAT-NEXT:    li s10, 23
+; REMAT-NEXT:    slli s10, s10, 9
+; REMAT-NEXT:    lui s11, 3
 ; REMAT-NEXT:    vsetvli zero, a4, e32, m2, ta, ma
 ; REMAT-NEXT:    vle32.v v8, (a5)
-; REMAT-NEXT:    li a4, 21
+; REMAT-NEXT:    li a4, 25
 ; REMAT-NEXT:    slli a4, a4, 9
 ; REMAT-NEXT:    vle32.v v10, (a3)
 ; REMAT-NEXT:    vle32.v v12, (a3)
-; REMAT-NEXT:    li a3, 11
+; REMAT-NEXT:    li a3, 13
 ; REMAT-NEXT:    slli a3, a3, 10
 ; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    vle32.v v16, (a2)
@@ -979,7 +990,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a5, 6
+; REMAT-NEXT:    li a5, 12
 ; REMAT-NEXT:    mul a2, a2, a5
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
@@ -989,7 +1000,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 2
+; REMAT-NEXT:    li a5, 10
+; REMAT-NEXT:    mul a2, a2, a5
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1003,11 +1015,16 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v14
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    add a2, a0, t5
-; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
-; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    add a2, a0, t6
-; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    vle32.v v18, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    add a2, a0, s0
@@ -1017,340 +1034,383 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, s1
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v30
-; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    add a2, a0, s2
-; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    li a6, 6
+; REMAT-NEXT:    li a6, 12
 ; REMAT-NEXT:    mul a5, a5, a6
 ; REMAT-NEXT:    add a5, sp, a5
 ; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v28, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v2
+; REMAT-NEXT:    vl2r.v v12, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    add a2, a0, s3
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    slli a5, a5, 2
+; REMAT-NEXT:    li a6, 10
+; REMAT-NEXT:    mul a5, a5, a6
 ; REMAT-NEXT:    add a5, sp, a5
 ; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v30, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v4
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    vl2r.v v16, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    add a2, a0, s5
 ; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v8
-; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    add a2, a0, s5
+; REMAT-NEXT:    vle32.v v10, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v14
+; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    add a2, a0, s6
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v16
-; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    csrr a5, vlenb
+; REMAT-NEXT:    slli a5, a5, 3
+; REMAT-NEXT:    add a5, sp, a5
+; REMAT-NEXT:    addi a5, a5, 432
+; REMAT-NEXT:    vl2r.v v0, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    add a2, a0, s7
-; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    vle32.v v18, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    add a2, a0, a4
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v20, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, s8
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
 ; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    add a2, a0, s9
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    add a2, a0, s10
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v12
+; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    add a2, a0, s11
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v16
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    addi a2, sp, 432
-; REMAT-NEXT:    vs2r.v v24, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, a4
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
+; REMAT-NEXT:    vle32.v v10, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 1
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    add a2, a0, a3
-; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v12
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    li a5, 23
+; REMAT-NEXT:    vle32.v v10, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 12
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li a5, 27
 ; REMAT-NEXT:    slli a5, a5, 9
 ; REMAT-NEXT:    add a2, a0, a5
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v28
 ; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    vle32.v v18, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 6
+; REMAT-NEXT:    li a3, 10
 ; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    lui s8, 3
-; REMAT-NEXT:    add a2, a0, s8
+; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li ra, 7
+; REMAT-NEXT:    slli ra, ra, 11
+; REMAT-NEXT:    add a2, a0, ra
 ; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v30
-; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v18, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v20
+; REMAT-NEXT:    vle32.v v18, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 2
+; REMAT-NEXT:    slli a2, a2, 3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li s9, 25
-; REMAT-NEXT:    slli s9, s9, 9
-; REMAT-NEXT:    add a2, a0, s9
+; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li a2, 29
+; REMAT-NEXT:    slli a2, a2, 9
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v6
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li s10, 13
-; REMAT-NEXT:    slli s10, s10, 10
-; REMAT-NEXT:    add a2, a0, s10
+; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v22
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 6
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li a2, 15
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v8
+; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 1
+; REMAT-NEXT:    slli a2, a2, 2
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li s11, 27
-; REMAT-NEXT:    slli s11, s11, 9
-; REMAT-NEXT:    add a2, a0, s11
+; REMAT-NEXT:    li a2, 31
+; REMAT-NEXT:    slli a2, a2, 9
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v16
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    li ra, 7
-; REMAT-NEXT:    slli ra, ra, 11
-; REMAT-NEXT:    add a2, a0, ra
+; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v20
+; REMAT-NEXT:    addi a3, sp, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v16
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li a2, 29
-; REMAT-NEXT:    slli a2, a2, 9
+; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    addiw a2, a2, 512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    addi a3, sp, 432
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 1
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v24
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 15
+; REMAT-NEXT:    li a2, 17
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v26
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 12
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 31
-; REMAT-NEXT:    slli a2, a2, 9
+; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    addiw a2, a2, 1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 6
+; REMAT-NEXT:    li a4, 10
 ; REMAT-NEXT:    mul a3, a3, a4
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    li a2, 9
+; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 2
+; REMAT-NEXT:    slli a3, a3, 3
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, -1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 6
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v14, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li a2, 17
+; REMAT-NEXT:    li a2, 19
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 1
+; REMAT-NEXT:    slli a3, a3, 2
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v16, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    addiw a2, a2, 1536
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, -512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    li a2, 9
-; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    lui a2, 5
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    addiw a2, a2, 512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 19
+; REMAT-NEXT:    li a2, 21
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, -512
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s4, 5
+; REMAT-NEXT:    addiw s4, s4, 1536
+; REMAT-NEXT:    add a2, a0, s4
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    li a2, 11
+; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, 512
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s3, 6
+; REMAT-NEXT:    addiw s3, s3, -1536
+; REMAT-NEXT:    add a2, a0, s3
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li a2, 21
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    li s2, 23
+; REMAT-NEXT:    slli s2, s2, 10
+; REMAT-NEXT:    add a2, a0, s2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, 1536
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    addiw a2, a2, -512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    li a2, 11
-; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    lui a2, 6
 ; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s1, 6
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    lui a2, 6
-; REMAT-NEXT:    addiw a2, a2, -1536
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s0, 6
+; REMAT-NEXT:    addiw s0, s0, 512
+; REMAT-NEXT:    add a2, a0, s0
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 23
+; REMAT-NEXT:    li a2, 25
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui a2, 6
-; REMAT-NEXT:    addiw a2, a2, -512
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t6, 6
+; REMAT-NEXT:    addiw t6, t6, 1536
+; REMAT-NEXT:    add a2, a0, t6
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 6
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    lui s1, 6
+; REMAT-NEXT:    li t5, 13
+; REMAT-NEXT:    slli t5, t5, 11
+; REMAT-NEXT:    add a2, a0, t5
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui s0, 6
-; REMAT-NEXT:    addiw s0, s0, 512
-; REMAT-NEXT:    add a2, a0, s0
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li a2, 25
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    li t4, 27
+; REMAT-NEXT:    slli t4, t4, 10
+; REMAT-NEXT:    add a2, a0, t4
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui t6, 6
-; REMAT-NEXT:    addiw t6, t6, 1536
-; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    li t5, 13
-; REMAT-NEXT:    slli t5, t5, 11
-; REMAT-NEXT:    add a2, a0, t5
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t3, 7
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    addiw a2, a2, -1536
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t2, 7
+; REMAT-NEXT:    addiw t2, t2, 512
+; REMAT-NEXT:    add a2, a0, t2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li t4, 27
-; REMAT-NEXT:    slli t4, t4, 10
-; REMAT-NEXT:    add a2, a0, t4
+; REMAT-NEXT:    li t1, 29
+; REMAT-NEXT:    slli t1, t1, 10
+; REMAT-NEXT:    add a2, a0, t1
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    addiw a2, a2, -512
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t0, 7
+; REMAT-NEXT:    addiw t0, t0, 1536
+; REMAT-NEXT:    add a2, a0, t0
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    lui t3, 7
+; REMAT-NEXT:    li a7, 15
+; REMAT-NEXT:    slli a7, a7, 11
+; REMAT-NEXT:    add a2, a0, a7
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui t2, 7
-; REMAT-NEXT:    addiw t2, t2, 512
-; REMAT-NEXT:    add a2, a0, t2
+; REMAT-NEXT:    lui a6, 8
+; REMAT-NEXT:    addiw a6, a6, -1536
+; REMAT-NEXT:    add a2, a0, a6
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li t1, 29
-; REMAT-NEXT:    slli t1, t1, 10
-; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    li a4, 31
+; REMAT-NEXT:    slli a4, a4, 10
+; REMAT-NEXT:    add a2, a0, a4
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui t0, 7
-; REMAT-NEXT:    addiw t0, t0, 1536
-; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    lui a3, 8
+; REMAT-NEXT:    addiw a3, a3, -512
+; REMAT-NEXT:    add a2, a0, a3
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    li a7, 15
-; REMAT-NEXT:    slli a7, a7, 11
-; REMAT-NEXT:    add a2, a0, a7
-; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    lui a2, 8
+; REMAT-NEXT:    add a0, a0, a2
+; REMAT-NEXT:    vle32.v v2, (a0)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    lui a6, 8
-; REMAT-NEXT:    addiw a6, a6, -1536
-; REMAT-NEXT:    add a2, a0, a6
-; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a4, 31
-; REMAT-NEXT:    slli a4, a4, 10
-; REMAT-NEXT:    add a2, a0, a4
-; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui a3, 8
-; REMAT-NEXT:    addiw a3, a3, -512
-; REMAT-NEXT:    add a2, a0, a3
-; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 8
-; REMAT-NEXT:    add a0, a0, a2
-; REMAT-NEXT:    vle32.v v28, (a0)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    addi a0, a1, 1024
 ; REMAT-NEXT:    vse32.v v8, (a0)
@@ -1397,36 +1457,41 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 336(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s2, a1, s2
-; REMAT-NEXT:    sd s2, 328(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s3, a1, s3
-; REMAT-NEXT:    sd s3, 320(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s4, a1, s4
-; REMAT-NEXT:    sd s4, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 15
+; REMAT-NEXT:    slli a0, a0, 9
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 2
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 17
+; REMAT-NEXT:    slli a0, a0, 9
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 312(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s5, a1, s5
 ; REMAT-NEXT:    sd s5, 304(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s6, a1, s6
 ; REMAT-NEXT:    sd s6, 296(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s7, a1, s7
 ; REMAT-NEXT:    sd s7, 288(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 21
+; REMAT-NEXT:    add s8, a1, s8
+; REMAT-NEXT:    sd s8, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s9, a1, s9
+; REMAT-NEXT:    sd s9, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s10, a1, s10
+; REMAT-NEXT:    sd s10, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s11, a1, s11
+; REMAT-NEXT:    sd s11, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 25
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 11
+; REMAT-NEXT:    sd a0, 248(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 13
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add a5, a1, a5
-; REMAT-NEXT:    sd a5, 264(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s8, a1, s8
-; REMAT-NEXT:    sd s8, 256(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s9, a1, s9
-; REMAT-NEXT:    sd s9, 248(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s10, a1, s10
-; REMAT-NEXT:    sd s10, 240(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s11, a1, s11
-; REMAT-NEXT:    sd s11, 232(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd a5, 232(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add ra, a1, ra
 ; REMAT-NEXT:    sd ra, 224(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 29
@@ -1483,22 +1548,16 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    lui a0, 5
-; REMAT-NEXT:    addiw a0, a0, 1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s4, a1, s4
+; REMAT-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 11
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    lui a0, 6
-; REMAT-NEXT:    addiw a0, a0, -1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 23
-; REMAT-NEXT:    slli a0, a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s3, a1, s3
+; REMAT-NEXT:    sd s3, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s2, a1, s2
+; REMAT-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 6
 ; REMAT-NEXT:    addiw a0, a0, -512
 ; REMAT-NEXT:    add a0, a1, a0
@@ -1795,7 +1854,8 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    csrr a0, vlenb
-; REMAT-NEXT:    slli a0, a0, 3
+; REMAT-NEXT:    li a1, 14
+; REMAT-NEXT:    mul a0, a0, a1
 ; REMAT-NEXT:    add sp, sp, a0
 ; REMAT-NEXT:    .cfi_def_cfa sp, 544
 ; REMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 0b5856a7000dd4..575a757149ebba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -5682,16 +5682,28 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ;
 ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    addi sp, sp, -16
-; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
-; RV32ZVE32F-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s3, 0(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    addi sp, sp, -48
+; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 48
+; RV32ZVE32F-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
 ; RV32ZVE32F-NEXT:    .cfi_offset s3, -16
+; RV32ZVE32F-NEXT:    .cfi_offset s4, -20
+; RV32ZVE32F-NEXT:    .cfi_offset s5, -24
+; RV32ZVE32F-NEXT:    .cfi_offset s6, -28
+; RV32ZVE32F-NEXT:    .cfi_offset s7, -32
+; RV32ZVE32F-NEXT:    .cfi_offset s8, -36
+; RV32ZVE32F-NEXT:    .cfi_offset s9, -40
 ; RV32ZVE32F-NEXT:    .cfi_remember_state
 ; RV32ZVE32F-NEXT:    lw a3, 56(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 60(a0)
@@ -5703,30 +5715,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    lw t4, 28(a0)
 ; RV32ZVE32F-NEXT:    lw t1, 32(a0)
 ; RV32ZVE32F-NEXT:    lw t2, 36(a0)
-; RV32ZVE32F-NEXT:    lw t5, 0(a2)
-; RV32ZVE32F-NEXT:    lw t6, 8(a2)
-; RV32ZVE32F-NEXT:    lw s0, 16(a2)
-; RV32ZVE32F-NEXT:    lw s1, 24(a2)
-; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.v.x v8, t5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t6
-; RV32ZVE32F-NEXT:    lw t5, 32(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
-; RV32ZVE32F-NEXT:    lw s2, 48(a2)
-; RV32ZVE32F-NEXT:    lw s3, 56(a2)
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s1
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t6
 ; RV32ZVE32F-NEXT:    lw s0, 8(a0)
 ; RV32ZVE32F-NEXT:    lw s1, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t5, 16(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 20(a0)
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
+; RV32ZVE32F-NEXT:    lw s2, 32(a2)
+; RV32ZVE32F-NEXT:    lw s3, 40(a2)
+; RV32ZVE32F-NEXT:    lw s4, 48(a2)
+; RV32ZVE32F-NEXT:    lw s5, 56(a2)
+; RV32ZVE32F-NEXT:    lw s6, 0(a2)
+; RV32ZVE32F-NEXT:    lw s7, 8(a2)
+; RV32ZVE32F-NEXT:    lw s8, 16(a2)
+; RV32ZVE32F-NEXT:    lw s9, 24(a2)
+; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.v.x v8, s6
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s9
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    andi s2, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5759,15 +5771,27 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    sw a3, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB51_9: # %else14
-; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s3, 0(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    .cfi_restore s0
 ; RV32ZVE32F-NEXT:    .cfi_restore s1
 ; RV32ZVE32F-NEXT:    .cfi_restore s2
 ; RV32ZVE32F-NEXT:    .cfi_restore s3
-; RV32ZVE32F-NEXT:    addi sp, sp, 16
+; RV32ZVE32F-NEXT:    .cfi_restore s4
+; RV32ZVE32F-NEXT:    .cfi_restore s5
+; RV32ZVE32F-NEXT:    .cfi_restore s6
+; RV32ZVE32F-NEXT:    .cfi_restore s7
+; RV32ZVE32F-NEXT:    .cfi_restore s8
+; RV32ZVE32F-NEXT:    .cfi_restore s9
+; RV32ZVE32F-NEXT:    addi sp, sp, 48
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZVE32F-NEXT:    ret
 ; RV32ZVE32F-NEXT:  .LBB51_10: # %cond.store
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 036fee6a13ca4c..a11c02dd5e2cb4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1306,12 +1306,6 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 308(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 218(sp)
-; ZVFHMIN32-NEXT:    lh a0, 562(sp)
-; ZVFHMIN32-NEXT:    lh a1, 306(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
@@ -1364,82 +1358,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 13
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
+; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v8, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v8, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v16
+; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 217(sp)
-; ZVFHMIN32-NEXT:    lh a0, 560(sp)
-; ZVFHMIN32-NEXT:    lh a1, 304(sp)
+; ZVFHMIN32-NEXT:    sb a0, 218(sp)
+; ZVFHMIN32-NEXT:    lh a0, 562(sp)
+; ZVFHMIN32-NEXT:    lh a1, 306(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v21, v16, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v19, v16, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v5, v16, 5
 ; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a4, 10
-; ZVFHMIN32-NEXT:    mul a2, a2, a4
+; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a4, a2, 4
-; ZVFHMIN32-NEXT:    sub a2, a4, a2
+; ZVFHMIN32-NEXT:    li a3, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a4, a2, 4
-; ZVFHMIN32-NEXT:    add a2, a4, a2
+; ZVFHMIN32-NEXT:    li a3, 19
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 10
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a4, 11
-; ZVFHMIN32-NEXT:    mul a2, a2, a4
+; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v4, v16, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 8
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 216(sp)
-; ZVFHMIN32-NEXT:    lh a0, 558(sp)
-; ZVFHMIN32-NEXT:    lh a1, 302(sp)
+; ZVFHMIN32-NEXT:    sb a0, 217(sp)
+; ZVFHMIN32-NEXT:    lh a0, 560(sp)
+; ZVFHMIN32-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 5
 ; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v31, v0, 3
-; ZVFHMIN32-NEXT:    vslidedown.vi v5, v0, 2
-; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 1
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
@@ -1449,88 +1447,99 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 14
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
+; ZVFHMIN32-NEXT:    slli a2, a2, 3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 13
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a4, 6
-; ZVFHMIN32-NEXT:    mul a2, a2, a4
+; ZVFHMIN32-NEXT:    li a3, 6
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 12
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 3
+; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 11
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a4, 13
-; ZVFHMIN32-NEXT:    mul a2, a2, a4
+; ZVFHMIN32-NEXT:    li a3, 10
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a4, 19
-; ZVFHMIN32-NEXT:    mul a2, a2, a4
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 9
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a4, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a4
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
+; ZVFHMIN32-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v26
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 216(sp)
+; ZVFHMIN32-NEXT:    lh a0, 558(sp)
+; ZVFHMIN32-NEXT:    lh a1, 302(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v20
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 300(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v26
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v28
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v0
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
-; ZVFHMIN32-NEXT:    vl2r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v16
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v6
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v30
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 296(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v22
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v22
+; ZVFHMIN32-NEXT:    sw a2, 104(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN32-NEXT:    sw a2, 108(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 294(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v20
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN32-NEXT:    sw a2, 108(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v14
+; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v12
+; ZVFHMIN32-NEXT:    sw a2, 116(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 211(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 548(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 292(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v14
-; ZVFHMIN32-NEXT:    sw a2, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
+; ZVFHMIN32-NEXT:    sw a2, 120(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    sw a2, 124(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
@@ -1539,208 +1548,204 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 210(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 546(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 290(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v24
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v24
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN32-NEXT:    sb a0, 209(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 544(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 288(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 192(sp)
+; ZVFHMIN32-NEXT:    sb t5, 192(sp)
 ; ZVFHMIN32-NEXT:    sb a0, 208(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 738(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 482(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN32-NEXT:    sw a2, 120(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 177(sp)
-; ZVFHMIN32-NEXT:    lh a0, 736(sp)
-; ZVFHMIN32-NEXT:    lh a1, 480(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 29
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s7, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 28
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 176(sp)
-; ZVFHMIN32-NEXT:    lh a0, 734(sp)
-; ZVFHMIN32-NEXT:    lh a1, 478(sp)
+; ZVFHMIN32-NEXT:    sb a0, 177(sp)
+; ZVFHMIN32-NEXT:    lh a0, 736(sp)
+; ZVFHMIN32-NEXT:    lh a1, 480(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 27
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s6, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s8, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 26
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 175(sp)
-; ZVFHMIN32-NEXT:    lh a0, 732(sp)
-; ZVFHMIN32-NEXT:    lh a1, 476(sp)
+; ZVFHMIN32-NEXT:    sb a0, 176(sp)
+; ZVFHMIN32-NEXT:    lh a0, 734(sp)
+; ZVFHMIN32-NEXT:    lh a1, 478(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 25
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s7, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s9, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 24
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s6, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 174(sp)
-; ZVFHMIN32-NEXT:    lh a0, 730(sp)
-; ZVFHMIN32-NEXT:    lh a1, 474(sp)
+; ZVFHMIN32-NEXT:    sb a0, 175(sp)
+; ZVFHMIN32-NEXT:    lh a0, 732(sp)
+; ZVFHMIN32-NEXT:    lh a1, 476(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 23
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s8, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v21
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 173(sp)
-; ZVFHMIN32-NEXT:    lh a0, 728(sp)
-; ZVFHMIN32-NEXT:    lh a1, 472(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v3
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v19
+; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v3
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 172(sp)
-; ZVFHMIN32-NEXT:    lh a0, 726(sp)
-; ZVFHMIN32-NEXT:    lh a1, 470(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s10, v11
-; ZVFHMIN32-NEXT:    vmv.x.s s11, v7
+; ZVFHMIN32-NEXT:    sb a0, 174(sp)
+; ZVFHMIN32-NEXT:    lh a0, 730(sp)
+; ZVFHMIN32-NEXT:    lh a1, 474(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s2, v31
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 171(sp)
-; ZVFHMIN32-NEXT:    lh a0, 724(sp)
-; ZVFHMIN32-NEXT:    lh s9, 468(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v9
-; ZVFHMIN32-NEXT:    vmv.x.s ra, v29
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN32-NEXT:    sb a0, 173(sp)
+; ZVFHMIN32-NEXT:    lh a1, 728(sp)
+; ZVFHMIN32-NEXT:    lh s10, 472(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v9
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v11
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 172(sp)
+; ZVFHMIN32-NEXT:    lh a1, 726(sp)
+; ZVFHMIN32-NEXT:    lh s10, 470(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v13
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v29
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 171(sp)
+; ZVFHMIN32-NEXT:    lh ra, 724(sp)
+; ZVFHMIN32-NEXT:    lh a0, 468(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v27
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v7
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, ra
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 170(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 466(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s9, v31
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 169(sp)
-; ZVFHMIN32-NEXT:    lh a0, 720(sp)
-; ZVFHMIN32-NEXT:    lh a1, 464(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v27
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v21
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s7
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN32-NEXT:    sb a0, 168(sp)
-; ZVFHMIN32-NEXT:    lh a0, 718(sp)
-; ZVFHMIN32-NEXT:    lh a1, 462(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, s6
+; ZVFHMIN32-NEXT:    sb a0, 169(sp)
+; ZVFHMIN32-NEXT:    lh a0, 720(sp)
+; ZVFHMIN32-NEXT:    lh a1, 464(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
-; ZVFHMIN32-NEXT:    sb a0, 167(sp)
-; ZVFHMIN32-NEXT:    lh a0, 716(sp)
-; ZVFHMIN32-NEXT:    lh a1, 460(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s3
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, s7
+; ZVFHMIN32-NEXT:    sb a0, 168(sp)
+; ZVFHMIN32-NEXT:    lh a0, 718(sp)
+; ZVFHMIN32-NEXT:    lh a1, 462(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s5
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, s9
 ; ZVFHMIN32-NEXT:    fmv.h.x fa0, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x ft0, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa0, ft0
-; ZVFHMIN32-NEXT:    sb a0, 166(sp)
-; ZVFHMIN32-NEXT:    lh a0, 714(sp)
-; ZVFHMIN32-NEXT:    lh a1, 458(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, s4
-; ZVFHMIN32-NEXT:    fmv.h.x ft0, s8
+; ZVFHMIN32-NEXT:    sb a0, 167(sp)
+; ZVFHMIN32-NEXT:    lh a0, 716(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, s6
+; ZVFHMIN32-NEXT:    lh a1, 460(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, a3
 ; ZVFHMIN32-NEXT:    fmv.h.x ft1, a0
-; ZVFHMIN32-NEXT:    fmv.h.x ft2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, ft1, ft2
-; ZVFHMIN32-NEXT:    sb a0, 165(sp)
-; ZVFHMIN32-NEXT:    lh a0, 712(sp)
-; ZVFHMIN32-NEXT:    lh a1, 456(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft1, s10
-; ZVFHMIN32-NEXT:    fmv.h.x ft2, s11
-; ZVFHMIN32-NEXT:    fmv.h.x ft3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x ft4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, ft3, ft4
-; ZVFHMIN32-NEXT:    sb a0, 164(sp)
-; ZVFHMIN32-NEXT:    lh a0, 710(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft3, a4
-; ZVFHMIN32-NEXT:    lh a1, 454(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft4, ra
-; ZVFHMIN32-NEXT:    fmv.h.x ft5, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h a1, ft5, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    sb a1, 163(sp)
-; ZVFHMIN32-NEXT:    lh a1, 708(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft1, a2
-; ZVFHMIN32-NEXT:    lh a2, 452(sp)
-; ZVFHMIN32-NEXT:    feq.h a3, fa0, fa5
+; ZVFHMIN32-NEXT:    feq.h a1, ft1, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    sb a1, 166(sp)
+; ZVFHMIN32-NEXT:    lh a1, 714(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, a2
+; ZVFHMIN32-NEXT:    lh a2, 458(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h a1, ft0, ft1
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a2
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa0
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    feq.h a1, fa3, ft0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN32-NEXT:    sb a2, 165(sp)
+; ZVFHMIN32-NEXT:    lh a2, 712(sp)
+; ZVFHMIN32-NEXT:    lh a4, 456(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s11
+; ZVFHMIN32-NEXT:    feq.h s3, fa2, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    sb a2, 164(sp)
+; ZVFHMIN32-NEXT:    lh a2, 710(sp)
+; ZVFHMIN32-NEXT:    lh a4, 454(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s10
+; ZVFHMIN32-NEXT:    feq.h a5, fa1, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, ra
+; ZVFHMIN32-NEXT:    sb a2, 163(sp)
+; ZVFHMIN32-NEXT:    lh a2, 708(sp)
+; ZVFHMIN32-NEXT:    lh a4, 452(sp)
+; ZVFHMIN32-NEXT:    feq.h s4, fa0, fa3
+; ZVFHMIN32-NEXT:    feq.h s5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a2, 162(sp)
 ; ZVFHMIN32-NEXT:    lh a2, 706(sp)
 ; ZVFHMIN32-NEXT:    lh a4, 450(sp)
-; ZVFHMIN32-NEXT:    sb a1, 129(sp)
-; ZVFHMIN32-NEXT:    feq.h a1, fa1, fa5
-; ZVFHMIN32-NEXT:    sb a3, 130(sp)
-; ZVFHMIN32-NEXT:    feq.h a3, fa2, ft4
-; ZVFHMIN32-NEXT:    sb a1, 131(sp)
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, ft2
-; ZVFHMIN32-NEXT:    sb a3, 132(sp)
-; ZVFHMIN32-NEXT:    feq.h a3, fa3, ft3
+; ZVFHMIN32-NEXT:    sb s5, 129(sp)
+; ZVFHMIN32-NEXT:    sb s4, 130(sp)
+; ZVFHMIN32-NEXT:    sb a5, 131(sp)
+; ZVFHMIN32-NEXT:    sb s3, 132(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 133(sp)
-; ZVFHMIN32-NEXT:    sb a1, 134(sp)
+; ZVFHMIN32-NEXT:    sb a1, 133(sp)
+; ZVFHMIN32-NEXT:    sb a3, 134(sp)
 ; ZVFHMIN32-NEXT:    sb a0, 135(sp)
 ; ZVFHMIN32-NEXT:    sb a2, 161(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 354(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s4, v23
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v23
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 10
+; ZVFHMIN32-NEXT:    li a3, 18
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1748,12 +1753,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 352(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a2, 4
-; ZVFHMIN32-NEXT:    sub a2, a3, a2
+; ZVFHMIN32-NEXT:    li a3, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
@@ -1762,148 +1768,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 350(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa2
 ; ZVFHMIN32-NEXT:    sb a0, 239(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 604(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 348(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 238(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 602(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 346(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 237(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 600(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 344(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 236(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 598(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 342(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN32-NEXT:    vmv.x.s s10, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 235(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 596(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 340(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN32-NEXT:    vmv.x.s s11, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 234(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 594(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 338(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
-; ZVFHMIN32-NEXT:    vmv.x.s ra, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 233(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 592(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 336(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 232(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 590(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
 ; ZVFHMIN32-NEXT:    lh a1, 334(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s4
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa0
+; ZVFHMIN32-NEXT:    feq.h t5, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a3
 ; ZVFHMIN32-NEXT:    sb a0, 231(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 588(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a4
 ; ZVFHMIN32-NEXT:    lh a1, 332(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, s2
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, s5
-; ZVFHMIN32-NEXT:    fmv.h.x ft0, a0
-; ZVFHMIN32-NEXT:    fmv.h.x ft1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, ft0, ft1
-; ZVFHMIN32-NEXT:    sb a0, 230(sp)
-; ZVFHMIN32-NEXT:    lh a0, 586(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft0, s3
-; ZVFHMIN32-NEXT:    lh a1, 330(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft1, s6
-; ZVFHMIN32-NEXT:    fmv.h.x ft2, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft1
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN32-NEXT:    sb a1, 230(sp)
+; ZVFHMIN32-NEXT:    lh a1, 586(sp)
+; ZVFHMIN32-NEXT:    lh a4, 330(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h a1, ft2, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s7
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
 ; ZVFHMIN32-NEXT:    sb a1, 229(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 584(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft1, s8
-; ZVFHMIN32-NEXT:    lh a2, 328(sp)
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT:    lh a4, 328(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa3, ft1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT:    sb a2, 228(sp)
-; ZVFHMIN32-NEXT:    lh a2, 582(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN32-NEXT:    sb a1, 228(sp)
+; ZVFHMIN32-NEXT:    lh a1, 582(sp)
 ; ZVFHMIN32-NEXT:    lh a4, 326(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT:    feq.h t4, fa2, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s11
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, ra
-; ZVFHMIN32-NEXT:    sb a2, 227(sp)
-; ZVFHMIN32-NEXT:    lh a2, 580(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN32-NEXT:    sb a1, 227(sp)
+; ZVFHMIN32-NEXT:    lh a1, 580(sp)
 ; ZVFHMIN32-NEXT:    lh a4, 324(sp)
-; ZVFHMIN32-NEXT:    feq.h t5, fa0, fa5
-; ZVFHMIN32-NEXT:    feq.h t6, ft0, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa3
-; ZVFHMIN32-NEXT:    sb a2, 226(sp)
-; ZVFHMIN32-NEXT:    lh a2, 578(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 226(sp)
+; ZVFHMIN32-NEXT:    lh a1, 578(sp)
 ; ZVFHMIN32-NEXT:    lh a4, 322(sp)
-; ZVFHMIN32-NEXT:    sb t6, 193(sp)
-; ZVFHMIN32-NEXT:    feq.h t6, fa1, fa4
-; ZVFHMIN32-NEXT:    sb t5, 194(sp)
+; ZVFHMIN32-NEXT:    sb a2, 193(sp)
+; ZVFHMIN32-NEXT:    sb s2, 194(sp)
 ; ZVFHMIN32-NEXT:    sb t6, 195(sp)
-; ZVFHMIN32-NEXT:    sb t4, 196(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    sb a5, 196(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 197(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 197(sp)
 ; ZVFHMIN32-NEXT:    sb a3, 198(sp)
-; ZVFHMIN32-NEXT:    sb a0, 199(sp)
-; ZVFHMIN32-NEXT:    sb a2, 225(sp)
+; ZVFHMIN32-NEXT:    sb t5, 199(sp)
+; ZVFHMIN32-NEXT:    sb a1, 225(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a2, 4
-; ZVFHMIN32-NEXT:    add a2, a3, a2
+; ZVFHMIN32-NEXT:    li a3, 19
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 11
+; ZVFHMIN32-NEXT:    li a3, 14
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
@@ -1915,305 +1921,301 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 508(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v4
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v30
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 190(sp)
-; ZVFHMIN32-NEXT:    lh a0, 762(sp)
-; ZVFHMIN32-NEXT:    lh a1, 506(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v6
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    slli a2, a2, 2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 190(sp)
+; ZVFHMIN32-NEXT:    lh a0, 762(sp)
+; ZVFHMIN32-NEXT:    lh a1, 506(sp)
 ; ZVFHMIN32-NEXT:    csrr a3, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a3, 1
+; ZVFHMIN32-NEXT:    slli a3, a3, 3
 ; ZVFHMIN32-NEXT:    add a3, sp, a3
 ; ZVFHMIN32-NEXT:    addi a3, a3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 189(sp)
-; ZVFHMIN32-NEXT:    lh a0, 760(sp)
-; ZVFHMIN32-NEXT:    lh a1, 504(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
 ; ZVFHMIN32-NEXT:    csrr a4, vlenb
-; ZVFHMIN32-NEXT:    li t3, 6
-; ZVFHMIN32-NEXT:    mul a4, a4, t3
+; ZVFHMIN32-NEXT:    li a5, 6
+; ZVFHMIN32-NEXT:    mul a4, a4, a5
 ; ZVFHMIN32-NEXT:    add a4, sp, a4
 ; ZVFHMIN32-NEXT:    addi a4, a4, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN32-NEXT:    sb a0, 188(sp)
-; ZVFHMIN32-NEXT:    lh a0, 758(sp)
-; ZVFHMIN32-NEXT:    lh a1, 502(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    csrr t2, vlenb
-; ZVFHMIN32-NEXT:    slli t2, t2, 3
-; ZVFHMIN32-NEXT:    add t2, sp, t2
-; ZVFHMIN32-NEXT:    addi t2, t2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (t2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 187(sp)
-; ZVFHMIN32-NEXT:    lh a0, 756(sp)
-; ZVFHMIN32-NEXT:    lh a1, 500(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, t1
-; ZVFHMIN32-NEXT:    csrr t1, vlenb
-; ZVFHMIN32-NEXT:    li t3, 13
-; ZVFHMIN32-NEXT:    mul t1, t1, t3
-; ZVFHMIN32-NEXT:    add t1, sp, t1
-; ZVFHMIN32-NEXT:    addi t1, t1, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (t1) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
-; ZVFHMIN32-NEXT:    sb a0, 186(sp)
-; ZVFHMIN32-NEXT:    lh a0, 754(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, t0
-; ZVFHMIN32-NEXT:    lh a1, 498(sp)
-; ZVFHMIN32-NEXT:    csrr t0, vlenb
-; ZVFHMIN32-NEXT:    li t1, 19
-; ZVFHMIN32-NEXT:    mul t0, t0, t1
-; ZVFHMIN32-NEXT:    add t0, sp, t0
-; ZVFHMIN32-NEXT:    addi t0, t0, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (t0) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 189(sp)
+; ZVFHMIN32-NEXT:    lh a1, 760(sp)
+; ZVFHMIN32-NEXT:    lh a5, 504(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li s3, 12
+; ZVFHMIN32-NEXT:    mul a0, a0, s3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li t0, 21
-; ZVFHMIN32-NEXT:    mul a0, a0, t0
+; ZVFHMIN32-NEXT:    li s3, 10
+; ZVFHMIN32-NEXT:    mul a0, a0, s3
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    addi a0, a0, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa1, fa0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 188(sp)
+; ZVFHMIN32-NEXT:    lh a1, 758(sp)
+; ZVFHMIN32-NEXT:    lh a5, 502(sp)
+; ZVFHMIN32-NEXT:    csrr s3, vlenb
+; ZVFHMIN32-NEXT:    slli s3, s3, 4
+; ZVFHMIN32-NEXT:    add s3, sp, s3
+; ZVFHMIN32-NEXT:    addi s3, s3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s4, v8
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v16
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT:    sb a1, 187(sp)
+; ZVFHMIN32-NEXT:    lh a1, 756(sp)
+; ZVFHMIN32-NEXT:    lh a5, 500(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT:    sb a1, 186(sp)
+; ZVFHMIN32-NEXT:    lh a1, 754(sp)
+; ZVFHMIN32-NEXT:    lh a2, 498(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
 ; ZVFHMIN32-NEXT:    sb a1, 185(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 752(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a3
 ; ZVFHMIN32-NEXT:    lh a2, 496(sp)
-; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h t1, fa4, fa0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
 ; ZVFHMIN32-NEXT:    sb a1, 184(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 750(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN32-NEXT:    lh a2, 494(sp)
-; ZVFHMIN32-NEXT:    feq.h a3, fa3, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s5
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa2, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT:    sb a1, 183(sp)
+; ZVFHMIN32-NEXT:    lh a1, 748(sp)
+; ZVFHMIN32-NEXT:    lh a2, 492(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    sb a2, 183(sp)
-; ZVFHMIN32-NEXT:    lh a2, 748(sp)
-; ZVFHMIN32-NEXT:    lh a4, 492(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 182(sp)
+; ZVFHMIN32-NEXT:    lh a1, 746(sp)
+; ZVFHMIN32-NEXT:    lh a2, 490(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    sb a2, 182(sp)
-; ZVFHMIN32-NEXT:    lh a2, 746(sp)
-; ZVFHMIN32-NEXT:    lh a4, 490(sp)
+; ZVFHMIN32-NEXT:    sb a1, 181(sp)
+; ZVFHMIN32-NEXT:    lh a1, 744(sp)
+; ZVFHMIN32-NEXT:    lh a2, 488(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
 ; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a2, 104(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    sb a2, 181(sp)
-; ZVFHMIN32-NEXT:    lh a2, 744(sp)
-; ZVFHMIN32-NEXT:    lh a4, 488(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a4, 108(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v0
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    sb a2, 180(sp)
-; ZVFHMIN32-NEXT:    lh a2, 742(sp)
-; ZVFHMIN32-NEXT:    lh t2, 486(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN32-NEXT:    sb a1, 180(sp)
+; ZVFHMIN32-NEXT:    lh a1, 742(sp)
+; ZVFHMIN32-NEXT:    lh a7, 486(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 179(sp)
-; ZVFHMIN32-NEXT:    lh a2, 740(sp)
-; ZVFHMIN32-NEXT:    lh t2, 484(sp)
-; ZVFHMIN32-NEXT:    sb a1, 140(sp)
-; ZVFHMIN32-NEXT:    sb a3, 141(sp)
-; ZVFHMIN32-NEXT:    sb t1, 142(sp)
-; ZVFHMIN32-NEXT:    sb t0, 143(sp)
-; ZVFHMIN32-NEXT:    sb a5, 136(sp)
-; ZVFHMIN32-NEXT:    sb a0, 137(sp)
-; ZVFHMIN32-NEXT:    sb a6, 138(sp)
-; ZVFHMIN32-NEXT:    sb a7, 139(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 179(sp)
+; ZVFHMIN32-NEXT:    lh a1, 740(sp)
+; ZVFHMIN32-NEXT:    lh a7, 484(sp)
+; ZVFHMIN32-NEXT:    sb a3, 140(sp)
+; ZVFHMIN32-NEXT:    sb t1, 141(sp)
+; ZVFHMIN32-NEXT:    sb t3, 142(sp)
+; ZVFHMIN32-NEXT:    sb t4, 143(sp)
+; ZVFHMIN32-NEXT:    sb a2, 136(sp)
+; ZVFHMIN32-NEXT:    sb a6, 137(sp)
+; ZVFHMIN32-NEXT:    sb a4, 138(sp)
+; ZVFHMIN32-NEXT:    sb a0, 139(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 178(sp)
-; ZVFHMIN32-NEXT:    lh a0, 638(sp)
-; ZVFHMIN32-NEXT:    lh a1, 382(sp)
+; ZVFHMIN32-NEXT:    lh a1, 638(sp)
+; ZVFHMIN32-NEXT:    lh a2, 382(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 255(sp)
-; ZVFHMIN32-NEXT:    lh a0, 636(sp)
-; ZVFHMIN32-NEXT:    lh a1, 380(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 255(sp)
+; ZVFHMIN32-NEXT:    lh a1, 636(sp)
+; ZVFHMIN32-NEXT:    lh a2, 380(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 254(sp)
-; ZVFHMIN32-NEXT:    lh a0, 634(sp)
-; ZVFHMIN32-NEXT:    lh a1, 378(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 254(sp)
+; ZVFHMIN32-NEXT:    lh a1, 634(sp)
+; ZVFHMIN32-NEXT:    lh a2, 378(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 253(sp)
-; ZVFHMIN32-NEXT:    lh a0, 632(sp)
-; ZVFHMIN32-NEXT:    lh a1, 376(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 253(sp)
+; ZVFHMIN32-NEXT:    lh a1, 632(sp)
+; ZVFHMIN32-NEXT:    lh a2, 376(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 252(sp)
-; ZVFHMIN32-NEXT:    lh a0, 630(sp)
-; ZVFHMIN32-NEXT:    lh a1, 374(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 252(sp)
+; ZVFHMIN32-NEXT:    lh a1, 630(sp)
+; ZVFHMIN32-NEXT:    lh a2, 374(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 251(sp)
-; ZVFHMIN32-NEXT:    lh a0, 628(sp)
-; ZVFHMIN32-NEXT:    lh a1, 372(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 251(sp)
+; ZVFHMIN32-NEXT:    lh a1, 628(sp)
+; ZVFHMIN32-NEXT:    lh a2, 372(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a2, 108(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    sb a1, 250(sp)
+; ZVFHMIN32-NEXT:    lh a1, 626(sp)
+; ZVFHMIN32-NEXT:    lh a2, 370(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a2, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    sb a1, 249(sp)
+; ZVFHMIN32-NEXT:    lh a1, 624(sp)
+; ZVFHMIN32-NEXT:    lh a2, 368(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 250(sp)
-; ZVFHMIN32-NEXT:    lh a0, 626(sp)
-; ZVFHMIN32-NEXT:    lh a1, 370(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    sb a0, 248(sp)
+; ZVFHMIN32-NEXT:    lh a0, 622(sp)
+; ZVFHMIN32-NEXT:    lh a1, 366(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 249(sp)
-; ZVFHMIN32-NEXT:    lh a1, 624(sp)
-; ZVFHMIN32-NEXT:    lh a3, 368(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a3, 112(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    sb a1, 248(sp)
-; ZVFHMIN32-NEXT:    lh a1, 622(sp)
-; ZVFHMIN32-NEXT:    lh a3, 366(sp)
+; ZVFHMIN32-NEXT:    sb a0, 247(sp)
+; ZVFHMIN32-NEXT:    lh a0, 620(sp)
+; ZVFHMIN32-NEXT:    lh a1, 364(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a3, 120(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    sb a1, 247(sp)
-; ZVFHMIN32-NEXT:    lh a1, 620(sp)
-; ZVFHMIN32-NEXT:    lh a3, 364(sp)
+; ZVFHMIN32-NEXT:    sb a0, 246(sp)
+; ZVFHMIN32-NEXT:    lh a0, 618(sp)
+; ZVFHMIN32-NEXT:    lh a1, 362(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
 ; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT:    sb a1, 246(sp)
-; ZVFHMIN32-NEXT:    lh a1, 618(sp)
-; ZVFHMIN32-NEXT:    lh a3, 362(sp)
+; ZVFHMIN32-NEXT:    sb a0, 245(sp)
+; ZVFHMIN32-NEXT:    lh a0, 616(sp)
+; ZVFHMIN32-NEXT:    lh a1, 360(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    sb a1, 245(sp)
-; ZVFHMIN32-NEXT:    lh a1, 616(sp)
-; ZVFHMIN32-NEXT:    lh a3, 360(sp)
+; ZVFHMIN32-NEXT:    sb a0, 244(sp)
+; ZVFHMIN32-NEXT:    lh a0, 614(sp)
+; ZVFHMIN32-NEXT:    lh a1, 358(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
 ; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    sb a1, 244(sp)
-; ZVFHMIN32-NEXT:    lh a1, 614(sp)
-; ZVFHMIN32-NEXT:    lh a3, 358(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    sb a1, 243(sp)
-; ZVFHMIN32-NEXT:    lh a1, 612(sp)
-; ZVFHMIN32-NEXT:    lh a3, 356(sp)
-; ZVFHMIN32-NEXT:    sb t0, 204(sp)
-; ZVFHMIN32-NEXT:    sb a4, 205(sp)
-; ZVFHMIN32-NEXT:    sb a0, 206(sp)
-; ZVFHMIN32-NEXT:    sb a2, 207(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    sb a0, 243(sp)
+; ZVFHMIN32-NEXT:    lh a0, 612(sp)
+; ZVFHMIN32-NEXT:    lh a1, 356(sp)
+; ZVFHMIN32-NEXT:    sb a5, 204(sp)
+; ZVFHMIN32-NEXT:    sb a2, 205(sp)
+; ZVFHMIN32-NEXT:    sb a3, 206(sp)
+; ZVFHMIN32-NEXT:    sb a4, 207(sp)
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a2, 200(sp)
+; ZVFHMIN32-NEXT:    sb a6, 201(sp)
+; ZVFHMIN32-NEXT:    sb a7, 202(sp)
+; ZVFHMIN32-NEXT:    sb t0, 203(sp)
+; ZVFHMIN32-NEXT:    li a2, 128
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 200(sp)
-; ZVFHMIN32-NEXT:    sb a5, 201(sp)
-; ZVFHMIN32-NEXT:    sb a6, 202(sp)
-; ZVFHMIN32-NEXT:    sb a7, 203(sp)
-; ZVFHMIN32-NEXT:    li a0, 128
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 242(sp)
-; ZVFHMIN32-NEXT:    addi a1, sp, 128
-; ZVFHMIN32-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN32-NEXT:    vle8.v v8, (a1)
+; ZVFHMIN32-NEXT:    sb a0, 242(sp)
+; ZVFHMIN32-NEXT:    addi a0, sp, 128
+; ZVFHMIN32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN32-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN32-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN32-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN32-NEXT:    addi sp, s0, -896
@@ -2440,12 +2442,6 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 308(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 218(sp)
-; ZVFHMIN64-NEXT:    lh a0, 562(sp)
-; ZVFHMIN64-NEXT:    lh a1, 306(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
@@ -2498,82 +2494,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 13
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
+; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v8, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v8, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v16
+; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 217(sp)
-; ZVFHMIN64-NEXT:    lh a0, 560(sp)
-; ZVFHMIN64-NEXT:    lh a1, 304(sp)
+; ZVFHMIN64-NEXT:    sb a0, 218(sp)
+; ZVFHMIN64-NEXT:    lh a0, 562(sp)
+; ZVFHMIN64-NEXT:    lh a1, 306(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v21, v16, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v19, v16, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v5, v16, 5
 ; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a4, 10
-; ZVFHMIN64-NEXT:    mul a2, a2, a4
+; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a4, a2, 4
-; ZVFHMIN64-NEXT:    sub a2, a4, a2
+; ZVFHMIN64-NEXT:    li a3, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a4, a2, 4
-; ZVFHMIN64-NEXT:    add a2, a4, a2
+; ZVFHMIN64-NEXT:    li a3, 19
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 10
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a4, 11
-; ZVFHMIN64-NEXT:    mul a2, a2, a4
+; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v4, v16, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 8
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 216(sp)
-; ZVFHMIN64-NEXT:    lh a0, 558(sp)
-; ZVFHMIN64-NEXT:    lh a1, 302(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 5
-; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v31, v0, 3
-; ZVFHMIN64-NEXT:    vslidedown.vi v5, v0, 2
-; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN64-NEXT:    sb a0, 217(sp)
+; ZVFHMIN64-NEXT:    lh a0, 560(sp)
+; ZVFHMIN64-NEXT:    lh a1, 304(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 1
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
@@ -2583,88 +2583,99 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 14
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
+; ZVFHMIN64-NEXT:    slli a2, a2, 3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 13
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a4, 6
-; ZVFHMIN64-NEXT:    mul a2, a2, a4
+; ZVFHMIN64-NEXT:    li a3, 6
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 12
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 3
+; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 11
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a4, 13
-; ZVFHMIN64-NEXT:    mul a2, a2, a4
+; ZVFHMIN64-NEXT:    li a3, 10
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a4, 19
-; ZVFHMIN64-NEXT:    mul a2, a2, a4
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 9
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a4, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a4
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
+; ZVFHMIN64-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v26
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 216(sp)
+; ZVFHMIN64-NEXT:    lh a0, 558(sp)
+; ZVFHMIN64-NEXT:    lh a1, 302(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v20
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 300(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v26
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v28
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v0
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
-; ZVFHMIN64-NEXT:    vl2r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v16
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v6
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v30
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 296(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v22
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v22
+; ZVFHMIN64-NEXT:    sd a2, 80(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN64-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 294(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v20
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN64-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v14
+; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v12
+; ZVFHMIN64-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 211(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 548(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 292(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v14
-; ZVFHMIN64-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
+; ZVFHMIN64-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    sd a2, 120(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
@@ -2673,208 +2684,204 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 210(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 546(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 290(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v24
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v24
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN64-NEXT:    sb a0, 209(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 544(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 288(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 192(sp)
+; ZVFHMIN64-NEXT:    sb t5, 192(sp)
 ; ZVFHMIN64-NEXT:    sb a0, 208(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 738(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 482(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN64-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 177(sp)
-; ZVFHMIN64-NEXT:    lh a0, 736(sp)
-; ZVFHMIN64-NEXT:    lh a1, 480(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 29
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s7, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 28
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 176(sp)
-; ZVFHMIN64-NEXT:    lh a0, 734(sp)
-; ZVFHMIN64-NEXT:    lh a1, 478(sp)
+; ZVFHMIN64-NEXT:    sb a0, 177(sp)
+; ZVFHMIN64-NEXT:    lh a0, 736(sp)
+; ZVFHMIN64-NEXT:    lh a1, 480(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 27
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s6, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s8, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 26
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 175(sp)
-; ZVFHMIN64-NEXT:    lh a0, 732(sp)
-; ZVFHMIN64-NEXT:    lh a1, 476(sp)
+; ZVFHMIN64-NEXT:    sb a0, 176(sp)
+; ZVFHMIN64-NEXT:    lh a0, 734(sp)
+; ZVFHMIN64-NEXT:    lh a1, 478(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 25
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s7, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s9, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 24
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s6, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 174(sp)
-; ZVFHMIN64-NEXT:    lh a0, 730(sp)
-; ZVFHMIN64-NEXT:    lh a1, 474(sp)
+; ZVFHMIN64-NEXT:    sb a0, 175(sp)
+; ZVFHMIN64-NEXT:    lh a0, 732(sp)
+; ZVFHMIN64-NEXT:    lh a1, 476(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 23
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s8, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v21
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 173(sp)
-; ZVFHMIN64-NEXT:    lh a0, 728(sp)
-; ZVFHMIN64-NEXT:    lh a1, 472(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v3
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v19
+; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v3
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 172(sp)
-; ZVFHMIN64-NEXT:    lh a0, 726(sp)
-; ZVFHMIN64-NEXT:    lh a1, 470(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s10, v11
-; ZVFHMIN64-NEXT:    vmv.x.s s11, v7
+; ZVFHMIN64-NEXT:    sb a0, 174(sp)
+; ZVFHMIN64-NEXT:    lh a0, 730(sp)
+; ZVFHMIN64-NEXT:    lh a1, 474(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s2, v31
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 171(sp)
-; ZVFHMIN64-NEXT:    lh a0, 724(sp)
-; ZVFHMIN64-NEXT:    lh s9, 468(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v9
-; ZVFHMIN64-NEXT:    vmv.x.s ra, v29
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN64-NEXT:    sb a0, 173(sp)
+; ZVFHMIN64-NEXT:    lh a1, 728(sp)
+; ZVFHMIN64-NEXT:    lh s10, 472(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v9
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v11
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 172(sp)
+; ZVFHMIN64-NEXT:    lh a1, 726(sp)
+; ZVFHMIN64-NEXT:    lh s10, 470(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v13
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v29
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 171(sp)
+; ZVFHMIN64-NEXT:    lh ra, 724(sp)
+; ZVFHMIN64-NEXT:    lh a0, 468(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v27
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v7
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, ra
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 170(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 466(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s9, v31
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 169(sp)
-; ZVFHMIN64-NEXT:    lh a0, 720(sp)
-; ZVFHMIN64-NEXT:    lh a1, 464(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v27
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v21
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s7
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN64-NEXT:    sb a0, 168(sp)
-; ZVFHMIN64-NEXT:    lh a0, 718(sp)
-; ZVFHMIN64-NEXT:    lh a1, 462(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, s6
+; ZVFHMIN64-NEXT:    sb a0, 169(sp)
+; ZVFHMIN64-NEXT:    lh a0, 720(sp)
+; ZVFHMIN64-NEXT:    lh a1, 464(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
-; ZVFHMIN64-NEXT:    sb a0, 167(sp)
-; ZVFHMIN64-NEXT:    lh a0, 716(sp)
-; ZVFHMIN64-NEXT:    lh a1, 460(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s3
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, s7
+; ZVFHMIN64-NEXT:    sb a0, 168(sp)
+; ZVFHMIN64-NEXT:    lh a0, 718(sp)
+; ZVFHMIN64-NEXT:    lh a1, 462(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s5
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, s9
 ; ZVFHMIN64-NEXT:    fmv.h.x fa0, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x ft0, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa0, ft0
-; ZVFHMIN64-NEXT:    sb a0, 166(sp)
-; ZVFHMIN64-NEXT:    lh a0, 714(sp)
-; ZVFHMIN64-NEXT:    lh a1, 458(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, s4
-; ZVFHMIN64-NEXT:    fmv.h.x ft0, s8
+; ZVFHMIN64-NEXT:    sb a0, 167(sp)
+; ZVFHMIN64-NEXT:    lh a0, 716(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, s6
+; ZVFHMIN64-NEXT:    lh a1, 460(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, a3
 ; ZVFHMIN64-NEXT:    fmv.h.x ft1, a0
-; ZVFHMIN64-NEXT:    fmv.h.x ft2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, ft1, ft2
-; ZVFHMIN64-NEXT:    sb a0, 165(sp)
-; ZVFHMIN64-NEXT:    lh a0, 712(sp)
-; ZVFHMIN64-NEXT:    lh a1, 456(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft1, s10
-; ZVFHMIN64-NEXT:    fmv.h.x ft2, s11
-; ZVFHMIN64-NEXT:    fmv.h.x ft3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x ft4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, ft3, ft4
-; ZVFHMIN64-NEXT:    sb a0, 164(sp)
-; ZVFHMIN64-NEXT:    lh a0, 710(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft3, a4
-; ZVFHMIN64-NEXT:    lh a1, 454(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft4, ra
-; ZVFHMIN64-NEXT:    fmv.h.x ft5, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h a1, ft5, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    sb a1, 163(sp)
-; ZVFHMIN64-NEXT:    lh a1, 708(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft1, a2
-; ZVFHMIN64-NEXT:    lh a2, 452(sp)
-; ZVFHMIN64-NEXT:    feq.h a3, fa0, fa5
+; ZVFHMIN64-NEXT:    feq.h a1, ft1, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    sb a1, 166(sp)
+; ZVFHMIN64-NEXT:    lh a1, 714(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, a2
+; ZVFHMIN64-NEXT:    lh a2, 458(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h a1, ft0, ft1
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a2
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa0
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    feq.h a1, fa3, ft0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN64-NEXT:    sb a2, 165(sp)
+; ZVFHMIN64-NEXT:    lh a2, 712(sp)
+; ZVFHMIN64-NEXT:    lh a4, 456(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s11
+; ZVFHMIN64-NEXT:    feq.h s3, fa2, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    sb a2, 164(sp)
+; ZVFHMIN64-NEXT:    lh a2, 710(sp)
+; ZVFHMIN64-NEXT:    lh a4, 454(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s10
+; ZVFHMIN64-NEXT:    feq.h a5, fa1, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, ra
+; ZVFHMIN64-NEXT:    sb a2, 163(sp)
+; ZVFHMIN64-NEXT:    lh a2, 708(sp)
+; ZVFHMIN64-NEXT:    lh a4, 452(sp)
+; ZVFHMIN64-NEXT:    feq.h s4, fa0, fa3
+; ZVFHMIN64-NEXT:    feq.h s5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a2, 162(sp)
 ; ZVFHMIN64-NEXT:    lh a2, 706(sp)
 ; ZVFHMIN64-NEXT:    lh a4, 450(sp)
-; ZVFHMIN64-NEXT:    sb a1, 129(sp)
-; ZVFHMIN64-NEXT:    feq.h a1, fa1, fa5
-; ZVFHMIN64-NEXT:    sb a3, 130(sp)
-; ZVFHMIN64-NEXT:    feq.h a3, fa2, ft4
-; ZVFHMIN64-NEXT:    sb a1, 131(sp)
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, ft2
-; ZVFHMIN64-NEXT:    sb a3, 132(sp)
-; ZVFHMIN64-NEXT:    feq.h a3, fa3, ft3
+; ZVFHMIN64-NEXT:    sb s5, 129(sp)
+; ZVFHMIN64-NEXT:    sb s4, 130(sp)
+; ZVFHMIN64-NEXT:    sb a5, 131(sp)
+; ZVFHMIN64-NEXT:    sb s3, 132(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 133(sp)
-; ZVFHMIN64-NEXT:    sb a1, 134(sp)
+; ZVFHMIN64-NEXT:    sb a1, 133(sp)
+; ZVFHMIN64-NEXT:    sb a3, 134(sp)
 ; ZVFHMIN64-NEXT:    sb a0, 135(sp)
 ; ZVFHMIN64-NEXT:    sb a2, 161(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 354(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s4, v23
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v23
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 10
+; ZVFHMIN64-NEXT:    li a3, 18
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2882,12 +2889,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 352(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a2, 4
-; ZVFHMIN64-NEXT:    sub a2, a3, a2
+; ZVFHMIN64-NEXT:    li a3, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
@@ -2896,148 +2904,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 350(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa2
 ; ZVFHMIN64-NEXT:    sb a0, 239(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 604(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 348(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 238(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 602(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 346(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 237(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 600(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 344(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 236(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 598(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 342(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN64-NEXT:    vmv.x.s s10, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 235(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 596(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 340(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN64-NEXT:    vmv.x.s s11, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 234(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 594(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 338(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
-; ZVFHMIN64-NEXT:    vmv.x.s ra, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 233(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 592(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 336(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 232(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 590(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
 ; ZVFHMIN64-NEXT:    lh a1, 334(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s4
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa0
+; ZVFHMIN64-NEXT:    feq.h t5, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a3
 ; ZVFHMIN64-NEXT:    sb a0, 231(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 588(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a4
 ; ZVFHMIN64-NEXT:    lh a1, 332(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, s2
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, s5
-; ZVFHMIN64-NEXT:    fmv.h.x ft0, a0
-; ZVFHMIN64-NEXT:    fmv.h.x ft1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, ft0, ft1
-; ZVFHMIN64-NEXT:    sb a0, 230(sp)
-; ZVFHMIN64-NEXT:    lh a0, 586(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft0, s3
-; ZVFHMIN64-NEXT:    lh a1, 330(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft1, s6
-; ZVFHMIN64-NEXT:    fmv.h.x ft2, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft1
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN64-NEXT:    sb a1, 230(sp)
+; ZVFHMIN64-NEXT:    lh a1, 586(sp)
+; ZVFHMIN64-NEXT:    lh a4, 330(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h a1, ft2, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s7
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
 ; ZVFHMIN64-NEXT:    sb a1, 229(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 584(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft1, s8
-; ZVFHMIN64-NEXT:    lh a2, 328(sp)
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT:    lh a4, 328(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa3, ft1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT:    sb a2, 228(sp)
-; ZVFHMIN64-NEXT:    lh a2, 582(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN64-NEXT:    sb a1, 228(sp)
+; ZVFHMIN64-NEXT:    lh a1, 582(sp)
 ; ZVFHMIN64-NEXT:    lh a4, 326(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT:    feq.h t4, fa2, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s11
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, ra
-; ZVFHMIN64-NEXT:    sb a2, 227(sp)
-; ZVFHMIN64-NEXT:    lh a2, 580(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN64-NEXT:    sb a1, 227(sp)
+; ZVFHMIN64-NEXT:    lh a1, 580(sp)
 ; ZVFHMIN64-NEXT:    lh a4, 324(sp)
-; ZVFHMIN64-NEXT:    feq.h t5, fa0, fa5
-; ZVFHMIN64-NEXT:    feq.h t6, ft0, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa3
-; ZVFHMIN64-NEXT:    sb a2, 226(sp)
-; ZVFHMIN64-NEXT:    lh a2, 578(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 226(sp)
+; ZVFHMIN64-NEXT:    lh a1, 578(sp)
 ; ZVFHMIN64-NEXT:    lh a4, 322(sp)
-; ZVFHMIN64-NEXT:    sb t6, 193(sp)
-; ZVFHMIN64-NEXT:    feq.h t6, fa1, fa4
-; ZVFHMIN64-NEXT:    sb t5, 194(sp)
+; ZVFHMIN64-NEXT:    sb a2, 193(sp)
+; ZVFHMIN64-NEXT:    sb s2, 194(sp)
 ; ZVFHMIN64-NEXT:    sb t6, 195(sp)
-; ZVFHMIN64-NEXT:    sb t4, 196(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    sb a5, 196(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 197(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 197(sp)
 ; ZVFHMIN64-NEXT:    sb a3, 198(sp)
-; ZVFHMIN64-NEXT:    sb a0, 199(sp)
-; ZVFHMIN64-NEXT:    sb a2, 225(sp)
+; ZVFHMIN64-NEXT:    sb t5, 199(sp)
+; ZVFHMIN64-NEXT:    sb a1, 225(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a2, 4
-; ZVFHMIN64-NEXT:    add a2, a3, a2
+; ZVFHMIN64-NEXT:    li a3, 19
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 11
+; ZVFHMIN64-NEXT:    li a3, 14
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
@@ -3049,305 +3057,301 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 508(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v4
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v30
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 190(sp)
-; ZVFHMIN64-NEXT:    lh a0, 762(sp)
-; ZVFHMIN64-NEXT:    lh a1, 506(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v6
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    slli a2, a2, 2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 190(sp)
+; ZVFHMIN64-NEXT:    lh a0, 762(sp)
+; ZVFHMIN64-NEXT:    lh a1, 506(sp)
 ; ZVFHMIN64-NEXT:    csrr a3, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a3, 1
+; ZVFHMIN64-NEXT:    slli a3, a3, 3
 ; ZVFHMIN64-NEXT:    add a3, sp, a3
 ; ZVFHMIN64-NEXT:    addi a3, a3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 189(sp)
-; ZVFHMIN64-NEXT:    lh a0, 760(sp)
-; ZVFHMIN64-NEXT:    lh a1, 504(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
 ; ZVFHMIN64-NEXT:    csrr a4, vlenb
-; ZVFHMIN64-NEXT:    li t3, 6
-; ZVFHMIN64-NEXT:    mul a4, a4, t3
+; ZVFHMIN64-NEXT:    li a5, 6
+; ZVFHMIN64-NEXT:    mul a4, a4, a5
 ; ZVFHMIN64-NEXT:    add a4, sp, a4
 ; ZVFHMIN64-NEXT:    addi a4, a4, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN64-NEXT:    sb a0, 188(sp)
-; ZVFHMIN64-NEXT:    lh a0, 758(sp)
-; ZVFHMIN64-NEXT:    lh a1, 502(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT:    csrr t2, vlenb
-; ZVFHMIN64-NEXT:    slli t2, t2, 3
-; ZVFHMIN64-NEXT:    add t2, sp, t2
-; ZVFHMIN64-NEXT:    addi t2, t2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (t2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 187(sp)
-; ZVFHMIN64-NEXT:    lh a0, 756(sp)
-; ZVFHMIN64-NEXT:    lh a1, 500(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, t1
-; ZVFHMIN64-NEXT:    csrr t1, vlenb
-; ZVFHMIN64-NEXT:    li t3, 13
-; ZVFHMIN64-NEXT:    mul t1, t1, t3
-; ZVFHMIN64-NEXT:    add t1, sp, t1
-; ZVFHMIN64-NEXT:    addi t1, t1, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (t1) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
-; ZVFHMIN64-NEXT:    sb a0, 186(sp)
-; ZVFHMIN64-NEXT:    lh a0, 754(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, t0
-; ZVFHMIN64-NEXT:    lh a1, 498(sp)
-; ZVFHMIN64-NEXT:    csrr t0, vlenb
-; ZVFHMIN64-NEXT:    li t1, 19
-; ZVFHMIN64-NEXT:    mul t0, t0, t1
-; ZVFHMIN64-NEXT:    add t0, sp, t0
-; ZVFHMIN64-NEXT:    addi t0, t0, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (t0) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 189(sp)
+; ZVFHMIN64-NEXT:    lh a1, 760(sp)
+; ZVFHMIN64-NEXT:    lh a5, 504(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li s3, 12
+; ZVFHMIN64-NEXT:    mul a0, a0, s3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li t0, 21
-; ZVFHMIN64-NEXT:    mul a0, a0, t0
+; ZVFHMIN64-NEXT:    li s3, 10
+; ZVFHMIN64-NEXT:    mul a0, a0, s3
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    addi a0, a0, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa1, fa0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 188(sp)
+; ZVFHMIN64-NEXT:    lh a1, 758(sp)
+; ZVFHMIN64-NEXT:    lh a5, 502(sp)
+; ZVFHMIN64-NEXT:    csrr s3, vlenb
+; ZVFHMIN64-NEXT:    slli s3, s3, 4
+; ZVFHMIN64-NEXT:    add s3, sp, s3
+; ZVFHMIN64-NEXT:    addi s3, s3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s4, v8
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v16
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT:    sb a1, 187(sp)
+; ZVFHMIN64-NEXT:    lh a1, 756(sp)
+; ZVFHMIN64-NEXT:    lh a5, 500(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT:    sb a1, 186(sp)
+; ZVFHMIN64-NEXT:    lh a1, 754(sp)
+; ZVFHMIN64-NEXT:    lh a2, 498(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
 ; ZVFHMIN64-NEXT:    sb a1, 185(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 752(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a3
 ; ZVFHMIN64-NEXT:    lh a2, 496(sp)
-; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h t1, fa4, fa0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
 ; ZVFHMIN64-NEXT:    sb a1, 184(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 750(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN64-NEXT:    lh a2, 494(sp)
-; ZVFHMIN64-NEXT:    feq.h a3, fa3, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s5
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa2, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT:    sb a1, 183(sp)
+; ZVFHMIN64-NEXT:    lh a1, 748(sp)
+; ZVFHMIN64-NEXT:    lh a2, 492(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    sb a2, 183(sp)
-; ZVFHMIN64-NEXT:    lh a2, 748(sp)
-; ZVFHMIN64-NEXT:    lh a4, 492(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 182(sp)
+; ZVFHMIN64-NEXT:    lh a1, 746(sp)
+; ZVFHMIN64-NEXT:    lh a2, 490(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    sb a2, 182(sp)
-; ZVFHMIN64-NEXT:    lh a2, 746(sp)
-; ZVFHMIN64-NEXT:    lh a4, 490(sp)
+; ZVFHMIN64-NEXT:    sb a1, 181(sp)
+; ZVFHMIN64-NEXT:    lh a1, 744(sp)
+; ZVFHMIN64-NEXT:    lh a2, 488(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
 ; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a2, 80(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    sb a2, 181(sp)
-; ZVFHMIN64-NEXT:    lh a2, 744(sp)
-; ZVFHMIN64-NEXT:    lh a4, 488(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a4, 88(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v0
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN64-NEXT:    sb a2, 180(sp)
-; ZVFHMIN64-NEXT:    lh a2, 742(sp)
-; ZVFHMIN64-NEXT:    lh t2, 486(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN64-NEXT:    sb a1, 180(sp)
+; ZVFHMIN64-NEXT:    lh a1, 742(sp)
+; ZVFHMIN64-NEXT:    lh a7, 486(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 179(sp)
-; ZVFHMIN64-NEXT:    lh a2, 740(sp)
-; ZVFHMIN64-NEXT:    lh t2, 484(sp)
-; ZVFHMIN64-NEXT:    sb a1, 140(sp)
-; ZVFHMIN64-NEXT:    sb a3, 141(sp)
-; ZVFHMIN64-NEXT:    sb t1, 142(sp)
-; ZVFHMIN64-NEXT:    sb t0, 143(sp)
-; ZVFHMIN64-NEXT:    sb a5, 136(sp)
-; ZVFHMIN64-NEXT:    sb a0, 137(sp)
-; ZVFHMIN64-NEXT:    sb a6, 138(sp)
-; ZVFHMIN64-NEXT:    sb a7, 139(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 179(sp)
+; ZVFHMIN64-NEXT:    lh a1, 740(sp)
+; ZVFHMIN64-NEXT:    lh a7, 484(sp)
+; ZVFHMIN64-NEXT:    sb a3, 140(sp)
+; ZVFHMIN64-NEXT:    sb t1, 141(sp)
+; ZVFHMIN64-NEXT:    sb t3, 142(sp)
+; ZVFHMIN64-NEXT:    sb t4, 143(sp)
+; ZVFHMIN64-NEXT:    sb a2, 136(sp)
+; ZVFHMIN64-NEXT:    sb a6, 137(sp)
+; ZVFHMIN64-NEXT:    sb a4, 138(sp)
+; ZVFHMIN64-NEXT:    sb a0, 139(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 178(sp)
-; ZVFHMIN64-NEXT:    lh a0, 638(sp)
-; ZVFHMIN64-NEXT:    lh a1, 382(sp)
+; ZVFHMIN64-NEXT:    lh a1, 638(sp)
+; ZVFHMIN64-NEXT:    lh a2, 382(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 255(sp)
-; ZVFHMIN64-NEXT:    lh a0, 636(sp)
-; ZVFHMIN64-NEXT:    lh a1, 380(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 255(sp)
+; ZVFHMIN64-NEXT:    lh a1, 636(sp)
+; ZVFHMIN64-NEXT:    lh a2, 380(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 254(sp)
-; ZVFHMIN64-NEXT:    lh a0, 634(sp)
-; ZVFHMIN64-NEXT:    lh a1, 378(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 254(sp)
+; ZVFHMIN64-NEXT:    lh a1, 634(sp)
+; ZVFHMIN64-NEXT:    lh a2, 378(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 253(sp)
-; ZVFHMIN64-NEXT:    lh a0, 632(sp)
-; ZVFHMIN64-NEXT:    lh a1, 376(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 253(sp)
+; ZVFHMIN64-NEXT:    lh a1, 632(sp)
+; ZVFHMIN64-NEXT:    lh a2, 376(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 252(sp)
-; ZVFHMIN64-NEXT:    lh a0, 630(sp)
-; ZVFHMIN64-NEXT:    lh a1, 374(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 252(sp)
+; ZVFHMIN64-NEXT:    lh a1, 630(sp)
+; ZVFHMIN64-NEXT:    lh a2, 374(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 251(sp)
-; ZVFHMIN64-NEXT:    lh a0, 628(sp)
-; ZVFHMIN64-NEXT:    lh a1, 372(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 251(sp)
+; ZVFHMIN64-NEXT:    lh a1, 628(sp)
+; ZVFHMIN64-NEXT:    lh a2, 372(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a2, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    sb a1, 250(sp)
+; ZVFHMIN64-NEXT:    lh a1, 626(sp)
+; ZVFHMIN64-NEXT:    lh a2, 370(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a2, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    sb a1, 249(sp)
+; ZVFHMIN64-NEXT:    lh a1, 624(sp)
+; ZVFHMIN64-NEXT:    lh a2, 368(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 250(sp)
-; ZVFHMIN64-NEXT:    lh a0, 626(sp)
-; ZVFHMIN64-NEXT:    lh a1, 370(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    sb a0, 248(sp)
+; ZVFHMIN64-NEXT:    lh a0, 622(sp)
+; ZVFHMIN64-NEXT:    lh a1, 366(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 249(sp)
-; ZVFHMIN64-NEXT:    lh a1, 624(sp)
-; ZVFHMIN64-NEXT:    lh a3, 368(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a3, 96(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    sb a1, 248(sp)
-; ZVFHMIN64-NEXT:    lh a1, 622(sp)
-; ZVFHMIN64-NEXT:    lh a3, 366(sp)
+; ZVFHMIN64-NEXT:    sb a0, 247(sp)
+; ZVFHMIN64-NEXT:    lh a0, 620(sp)
+; ZVFHMIN64-NEXT:    lh a1, 364(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a3, 112(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    sb a1, 247(sp)
-; ZVFHMIN64-NEXT:    lh a1, 620(sp)
-; ZVFHMIN64-NEXT:    lh a3, 364(sp)
+; ZVFHMIN64-NEXT:    sb a0, 246(sp)
+; ZVFHMIN64-NEXT:    lh a0, 618(sp)
+; ZVFHMIN64-NEXT:    lh a1, 362(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
 ; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT:    sb a1, 246(sp)
-; ZVFHMIN64-NEXT:    lh a1, 618(sp)
-; ZVFHMIN64-NEXT:    lh a3, 362(sp)
+; ZVFHMIN64-NEXT:    sb a0, 245(sp)
+; ZVFHMIN64-NEXT:    lh a0, 616(sp)
+; ZVFHMIN64-NEXT:    lh a1, 360(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    sb a1, 245(sp)
-; ZVFHMIN64-NEXT:    lh a1, 616(sp)
-; ZVFHMIN64-NEXT:    lh a3, 360(sp)
+; ZVFHMIN64-NEXT:    sb a0, 244(sp)
+; ZVFHMIN64-NEXT:    lh a0, 614(sp)
+; ZVFHMIN64-NEXT:    lh a1, 358(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
 ; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    sb a1, 244(sp)
-; ZVFHMIN64-NEXT:    lh a1, 614(sp)
-; ZVFHMIN64-NEXT:    lh a3, 358(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    sb a1, 243(sp)
-; ZVFHMIN64-NEXT:    lh a1, 612(sp)
-; ZVFHMIN64-NEXT:    lh a3, 356(sp)
-; ZVFHMIN64-NEXT:    sb t0, 204(sp)
-; ZVFHMIN64-NEXT:    sb a4, 205(sp)
-; ZVFHMIN64-NEXT:    sb a0, 206(sp)
-; ZVFHMIN64-NEXT:    sb a2, 207(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    sb a0, 243(sp)
+; ZVFHMIN64-NEXT:    lh a0, 612(sp)
+; ZVFHMIN64-NEXT:    lh a1, 356(sp)
+; ZVFHMIN64-NEXT:    sb a5, 204(sp)
+; ZVFHMIN64-NEXT:    sb a2, 205(sp)
+; ZVFHMIN64-NEXT:    sb a3, 206(sp)
+; ZVFHMIN64-NEXT:    sb a4, 207(sp)
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a2, 200(sp)
+; ZVFHMIN64-NEXT:    sb a6, 201(sp)
+; ZVFHMIN64-NEXT:    sb a7, 202(sp)
+; ZVFHMIN64-NEXT:    sb t0, 203(sp)
+; ZVFHMIN64-NEXT:    li a2, 128
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 200(sp)
-; ZVFHMIN64-NEXT:    sb a5, 201(sp)
-; ZVFHMIN64-NEXT:    sb a6, 202(sp)
-; ZVFHMIN64-NEXT:    sb a7, 203(sp)
-; ZVFHMIN64-NEXT:    li a0, 128
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 242(sp)
-; ZVFHMIN64-NEXT:    addi a1, sp, 128
-; ZVFHMIN64-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN64-NEXT:    vle8.v v8, (a1)
+; ZVFHMIN64-NEXT:    sb a0, 242(sp)
+; ZVFHMIN64-NEXT:    addi a0, sp, 128
+; ZVFHMIN64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN64-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN64-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN64-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN64-NEXT:    addi sp, s0, -896
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index dd2a8240ee2533..5b272c98a1e0ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -507,26 +507,34 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
 define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
 ; RV32-LABEL: match_nxv16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    .cfi_offset s1, -8
-; RV32-NEXT:    .cfi_offset s2, -12
-; RV32-NEXT:    .cfi_offset s3, -16
-; RV32-NEXT:    .cfi_offset s4, -20
-; RV32-NEXT:    .cfi_offset s5, -24
-; RV32-NEXT:    .cfi_offset s6, -28
-; RV32-NEXT:    .cfi_offset s7, -32
-; RV32-NEXT:    .cfi_offset s8, -36
+; RV32-NEXT:    addi sp, sp, -64
+; RV32-NEXT:    .cfi_def_cfa_offset 64
+; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
+; RV32-NEXT:    .cfi_offset s2, -16
+; RV32-NEXT:    .cfi_offset s3, -20
+; RV32-NEXT:    .cfi_offset s4, -24
+; RV32-NEXT:    .cfi_offset s5, -28
+; RV32-NEXT:    .cfi_offset s6, -32
+; RV32-NEXT:    .cfi_offset s7, -36
+; RV32-NEXT:    .cfi_offset s8, -40
+; RV32-NEXT:    .cfi_offset s9, -44
+; RV32-NEXT:    .cfi_offset s10, -48
+; RV32-NEXT:    .cfi_offset s11, -52
 ; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v12, v10, 1
@@ -584,43 +592,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    vmv.x.s s5, v15
 ; RV32-NEXT:    vmv.x.s s6, v16
 ; RV32-NEXT:    vmv.x.s s7, v17
-; RV32-NEXT:    vsetvli s8, zero, e8, m2, ta, ma
+; RV32-NEXT:    vmv.x.s s8, v18
+; RV32-NEXT:    vmv.x.s s9, v19
+; RV32-NEXT:    vmv.x.s s10, v20
+; RV32-NEXT:    vmv.x.s s11, v21
+; RV32-NEXT:    vsetvli ra, zero, e8, m2, ta, ma
 ; RV32-NEXT:    vmseq.vx v12, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v18
+; RV32-NEXT:    vmv.x.s a0, v22
 ; RV32-NEXT:    vmseq.vx v13, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v19
+; RV32-NEXT:    vmv.x.s s2, v23
 ; RV32-NEXT:    vmseq.vx v14, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v20
-; RV32-NEXT:    vmseq.vx v15, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v21
-; RV32-NEXT:    vmseq.vx v16, v8, s5
-; RV32-NEXT:    vmv.x.s s5, v22
-; RV32-NEXT:    vmseq.vx v17, v8, s6
-; RV32-NEXT:    vmv.x.s s6, v23
-; RV32-NEXT:    vmseq.vx v18, v8, s7
-; RV32-NEXT:    vmv.x.s s7, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v24
-; RV32-NEXT:    vmseq.vx v19, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v10
+; RV32-NEXT:    vmv.x.s s3, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v24
+; RV32-NEXT:    vmseq.vx v15, v8, s5
+; RV32-NEXT:    vmv.x.s s5, v10
 ; RV32-NEXT:    vmor.mm v10, v12, v13
+; RV32-NEXT:    vmseq.vx v12, v8, s6
 ; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v13, v8, s7
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s8
 ; RV32-NEXT:    vmor.mm v10, v10, v15
-; RV32-NEXT:    vmor.mm v10, v10, v16
-; RV32-NEXT:    vmor.mm v10, v10, v17
-; RV32-NEXT:    vmseq.vx v12, v8, s3
-; RV32-NEXT:    vmor.mm v10, v10, v18
-; RV32-NEXT:    vmseq.vx v13, v8, s4
+; RV32-NEXT:    vmseq.vx v14, v8, s9
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s10
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s11
 ; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s5
-; RV32-NEXT:    vmor.mm v10, v10, v19
-; RV32-NEXT:    vmseq.vx v14, v8, s6
+; RV32-NEXT:    vmseq.vx v11, v8, a0
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, s2
 ; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s7
+; RV32-NEXT:    vmseq.vx v12, v8, s3
 ; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, a0
+; RV32-NEXT:    vmseq.vx v13, v8, s4
 ; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s2
+; RV32-NEXT:    vmseq.vx v11, v8, s5
 ; RV32-NEXT:    vmor.mm v10, v10, v14
 ; RV32-NEXT:    vmseq.vx v14, v8, a1
 ; RV32-NEXT:    vmor.mm v10, v10, v12
@@ -658,15 +666,20 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    vmseq.vx v11, v8, s1
 ; RV32-NEXT:    vmor.mm v8, v10, v11
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -676,32 +689,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    .cfi_restore s6
 ; RV32-NEXT:    .cfi_restore s7
 ; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    .cfi_restore s9
+; RV32-NEXT:    .cfi_restore s10
+; RV32-NEXT:    .cfi_restore s11
+; RV32-NEXT:    addi sp, sp, 64
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_nxv16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -80
-; RV64-NEXT:    .cfi_def_cfa_offset 80
-; RV64-NEXT:    sd s0, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset s0, -8
-; RV64-NEXT:    .cfi_offset s1, -16
-; RV64-NEXT:    .cfi_offset s2, -24
-; RV64-NEXT:    .cfi_offset s3, -32
-; RV64-NEXT:    .cfi_offset s4, -40
-; RV64-NEXT:    .cfi_offset s5, -48
-; RV64-NEXT:    .cfi_offset s6, -56
-; RV64-NEXT:    .cfi_offset s7, -64
-; RV64-NEXT:    .cfi_offset s8, -72
+; RV64-NEXT:    addi sp, sp, -112
+; RV64-NEXT:    .cfi_def_cfa_offset 112
+; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    .cfi_offset s2, -32
+; RV64-NEXT:    .cfi_offset s3, -40
+; RV64-NEXT:    .cfi_offset s4, -48
+; RV64-NEXT:    .cfi_offset s5, -56
+; RV64-NEXT:    .cfi_offset s6, -64
+; RV64-NEXT:    .cfi_offset s7, -72
+; RV64-NEXT:    .cfi_offset s8, -80
+; RV64-NEXT:    .cfi_offset s9, -88
+; RV64-NEXT:    .cfi_offset s10, -96
+; RV64-NEXT:    .cfi_offset s11, -104
 ; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v12, v10, 1
@@ -759,43 +783,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    vmv.x.s s5, v15
 ; RV64-NEXT:    vmv.x.s s6, v16
 ; RV64-NEXT:    vmv.x.s s7, v17
-; RV64-NEXT:    vsetvli s8, zero, e8, m2, ta, ma
+; RV64-NEXT:    vmv.x.s s8, v18
+; RV64-NEXT:    vmv.x.s s9, v19
+; RV64-NEXT:    vmv.x.s s10, v20
+; RV64-NEXT:    vmv.x.s s11, v21
+; RV64-NEXT:    vsetvli ra, zero, e8, m2, ta, ma
 ; RV64-NEXT:    vmseq.vx v12, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v18
+; RV64-NEXT:    vmv.x.s a0, v22
 ; RV64-NEXT:    vmseq.vx v13, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v19
+; RV64-NEXT:    vmv.x.s s2, v23
 ; RV64-NEXT:    vmseq.vx v14, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v20
-; RV64-NEXT:    vmseq.vx v15, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v21
-; RV64-NEXT:    vmseq.vx v16, v8, s5
-; RV64-NEXT:    vmv.x.s s5, v22
-; RV64-NEXT:    vmseq.vx v17, v8, s6
-; RV64-NEXT:    vmv.x.s s6, v23
-; RV64-NEXT:    vmseq.vx v18, v8, s7
-; RV64-NEXT:    vmv.x.s s7, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v24
-; RV64-NEXT:    vmseq.vx v19, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v10
+; RV64-NEXT:    vmv.x.s s3, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v24
+; RV64-NEXT:    vmseq.vx v15, v8, s5
+; RV64-NEXT:    vmv.x.s s5, v10
 ; RV64-NEXT:    vmor.mm v10, v12, v13
+; RV64-NEXT:    vmseq.vx v12, v8, s6
 ; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v13, v8, s7
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s8
 ; RV64-NEXT:    vmor.mm v10, v10, v15
-; RV64-NEXT:    vmor.mm v10, v10, v16
-; RV64-NEXT:    vmor.mm v10, v10, v17
-; RV64-NEXT:    vmseq.vx v12, v8, s3
-; RV64-NEXT:    vmor.mm v10, v10, v18
-; RV64-NEXT:    vmseq.vx v13, v8, s4
+; RV64-NEXT:    vmseq.vx v14, v8, s9
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s10
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s11
 ; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s5
-; RV64-NEXT:    vmor.mm v10, v10, v19
-; RV64-NEXT:    vmseq.vx v14, v8, s6
+; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, s2
 ; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s7
+; RV64-NEXT:    vmseq.vx v12, v8, s3
 ; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, a0
+; RV64-NEXT:    vmseq.vx v13, v8, s4
 ; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s2
+; RV64-NEXT:    vmseq.vx v11, v8, s5
 ; RV64-NEXT:    vmor.mm v10, v10, v14
 ; RV64-NEXT:    vmseq.vx v14, v8, a1
 ; RV64-NEXT:    vmor.mm v10, v10, v12
@@ -833,15 +857,20 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    vmseq.vx v11, v8, s1
 ; RV64-NEXT:    vmor.mm v8, v10, v11
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld s0, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
 ; RV64-NEXT:    .cfi_restore s0
 ; RV64-NEXT:    .cfi_restore s1
 ; RV64-NEXT:    .cfi_restore s2
@@ -851,7 +880,10 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    .cfi_restore s6
 ; RV64-NEXT:    .cfi_restore s7
 ; RV64-NEXT:    .cfi_restore s8
-; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    .cfi_restore s9
+; RV64-NEXT:    .cfi_restore s10
+; RV64-NEXT:    .cfi_restore s11
+; RV64-NEXT:    addi sp, sp, 112
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
@@ -861,16 +893,20 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
 ; RV32-LABEL: match_v16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 0(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    .cfi_offset s2, -12
@@ -879,6 +915,10 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    .cfi_offset s5, -24
 ; RV32-NEXT:    .cfi_offset s6, -28
 ; RV32-NEXT:    .cfi_offset s7, -32
+; RV32-NEXT:    .cfi_offset s8, -36
+; RV32-NEXT:    .cfi_offset s9, -40
+; RV32-NEXT:    .cfi_offset s10, -44
+; RV32-NEXT:    .cfi_offset s11, -48
 ; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v9, v10, 1
@@ -936,42 +976,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    vmv.x.s s5, v14
 ; RV32-NEXT:    vmv.x.s s6, v15
 ; RV32-NEXT:    vmv.x.s s7, v16
+; RV32-NEXT:    vmv.x.s s8, v17
+; RV32-NEXT:    vmv.x.s s9, v18
+; RV32-NEXT:    vmv.x.s s10, v19
+; RV32-NEXT:    vmv.x.s s11, v20
 ; RV32-NEXT:    vmseq.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v17
+; RV32-NEXT:    vmv.x.s a0, v21
 ; RV32-NEXT:    vmseq.vx v12, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v18
+; RV32-NEXT:    vmv.x.s s2, v22
 ; RV32-NEXT:    vmseq.vx v13, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v19
-; RV32-NEXT:    vmseq.vx v14, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v20
-; RV32-NEXT:    vmseq.vx v15, v8, s5
-; RV32-NEXT:    vmv.x.s s5, v21
-; RV32-NEXT:    vmseq.vx v16, v8, s6
-; RV32-NEXT:    vmv.x.s s6, v22
-; RV32-NEXT:    vmseq.vx v17, v8, s7
-; RV32-NEXT:    vmv.x.s s7, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v23
-; RV32-NEXT:    vmseq.vx v18, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v10
+; RV32-NEXT:    vmv.x.s s3, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v23
+; RV32-NEXT:    vmseq.vx v14, v8, s5
+; RV32-NEXT:    vmv.x.s s5, v10
 ; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v10, v8, s6
 ; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v12, v8, s7
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s8
 ; RV32-NEXT:    vmor.mm v9, v9, v14
-; RV32-NEXT:    vmor.mm v9, v9, v15
-; RV32-NEXT:    vmor.mm v9, v9, v16
-; RV32-NEXT:    vmseq.vx v10, v8, s3
-; RV32-NEXT:    vmor.mm v9, v9, v17
-; RV32-NEXT:    vmseq.vx v12, v8, s4
+; RV32-NEXT:    vmseq.vx v13, v8, s9
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, s10
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s11
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s5
-; RV32-NEXT:    vmor.mm v9, v9, v18
-; RV32-NEXT:    vmseq.vx v13, v8, s6
+; RV32-NEXT:    vmseq.vx v11, v8, a0
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s2
 ; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s7
+; RV32-NEXT:    vmseq.vx v10, v8, s3
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, a0
+; RV32-NEXT:    vmseq.vx v12, v8, s4
 ; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s2
+; RV32-NEXT:    vmseq.vx v11, v8, s5
 ; RV32-NEXT:    vmor.mm v9, v9, v13
 ; RV32-NEXT:    vmseq.vx v13, v8, a1
 ; RV32-NEXT:    vmor.mm v9, v9, v10
@@ -1009,14 +1049,18 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    vmseq.vx v8, v8, s1
 ; RV32-NEXT:    vmor.mm v8, v9, v8
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 0(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -1025,22 +1069,30 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    .cfi_restore s5
 ; RV32-NEXT:    .cfi_restore s6
 ; RV32-NEXT:    .cfi_restore s7
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    .cfi_restore s8
+; RV32-NEXT:    .cfi_restore s9
+; RV32-NEXT:    .cfi_restore s10
+; RV32-NEXT:    .cfi_restore s11
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_v16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd s0, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s11, 0(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset s0, -8
 ; RV64-NEXT:    .cfi_offset s1, -16
 ; RV64-NEXT:    .cfi_offset s2, -24
@@ -1049,6 +1101,10 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    .cfi_offset s5, -48
 ; RV64-NEXT:    .cfi_offset s6, -56
 ; RV64-NEXT:    .cfi_offset s7, -64
+; RV64-NEXT:    .cfi_offset s8, -72
+; RV64-NEXT:    .cfi_offset s9, -80
+; RV64-NEXT:    .cfi_offset s10, -88
+; RV64-NEXT:    .cfi_offset s11, -96
 ; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v9, v10, 1
@@ -1106,42 +1162,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    vmv.x.s s5, v14
 ; RV64-NEXT:    vmv.x.s s6, v15
 ; RV64-NEXT:    vmv.x.s s7, v16
+; RV64-NEXT:    vmv.x.s s8, v17
+; RV64-NEXT:    vmv.x.s s9, v18
+; RV64-NEXT:    vmv.x.s s10, v19
+; RV64-NEXT:    vmv.x.s s11, v20
 ; RV64-NEXT:    vmseq.vx v9, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v17
+; RV64-NEXT:    vmv.x.s a0, v21
 ; RV64-NEXT:    vmseq.vx v12, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v18
+; RV64-NEXT:    vmv.x.s s2, v22
 ; RV64-NEXT:    vmseq.vx v13, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v19
-; RV64-NEXT:    vmseq.vx v14, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v20
-; RV64-NEXT:    vmseq.vx v15, v8, s5
-; RV64-NEXT:    vmv.x.s s5, v21
-; RV64-NEXT:    vmseq.vx v16, v8, s6
-; RV64-NEXT:    vmv.x.s s6, v22
-; RV64-NEXT:    vmseq.vx v17, v8, s7
-; RV64-NEXT:    vmv.x.s s7, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v23
-; RV64-NEXT:    vmseq.vx v18, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v10
+; RV64-NEXT:    vmv.x.s s3, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v23
+; RV64-NEXT:    vmseq.vx v14, v8, s5
+; RV64-NEXT:    vmv.x.s s5, v10
 ; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v10, v8, s6
 ; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v12, v8, s7
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s8
 ; RV64-NEXT:    vmor.mm v9, v9, v14
-; RV64-NEXT:    vmor.mm v9, v9, v15
-; RV64-NEXT:    vmor.mm v9, v9, v16
-; RV64-NEXT:    vmseq.vx v10, v8, s3
-; RV64-NEXT:    vmor.mm v9, v9, v17
-; RV64-NEXT:    vmseq.vx v12, v8, s4
+; RV64-NEXT:    vmseq.vx v13, v8, s9
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, s10
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s11
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s5
-; RV64-NEXT:    vmor.mm v9, v9, v18
-; RV64-NEXT:    vmseq.vx v13, v8, s6
+; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s2
 ; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s7
+; RV64-NEXT:    vmseq.vx v10, v8, s3
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, a0
+; RV64-NEXT:    vmseq.vx v12, v8, s4
 ; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s2
+; RV64-NEXT:    vmseq.vx v11, v8, s5
 ; RV64-NEXT:    vmor.mm v9, v9, v13
 ; RV64-NEXT:    vmseq.vx v13, v8, a1
 ; RV64-NEXT:    vmor.mm v9, v9, v10
@@ -1179,14 +1235,18 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    vmseq.vx v8, v8, s1
 ; RV64-NEXT:    vmor.mm v8, v9, v8
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld s0, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s11, 0(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore s0
 ; RV64-NEXT:    .cfi_restore s1
 ; RV64-NEXT:    .cfi_restore s2
@@ -1195,7 +1255,11 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    .cfi_restore s5
 ; RV64-NEXT:    .cfi_restore s6
 ; RV64-NEXT:    .cfi_restore s7
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    .cfi_restore s8
+; RV64-NEXT:    .cfi_restore s9
+; RV64-NEXT:    .cfi_restore s10
+; RV64-NEXT:    .cfi_restore s11
+; RV64-NEXT:    addi sp, sp, 96
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 22e6f23d4d6e6a..123048d996360c 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2203,136 +2203,139 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t3, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu s0, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s6, 10(a0)
+; RV32I-NEXT:    lbu s8, 11(a0)
+; RV32I-NEXT:    lbu s9, 12(a0)
+; RV32I-NEXT:    lbu s10, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s7, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu ra, 18(a0)
+; RV32I-NEXT:    lbu a3, 19(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu t0, 23(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or a4, a4, s1
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    or a5, t3, t1
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu t1, 24(a0)
 ; RV32I-NEXT:    lbu s0, 25(a0)
 ; RV32I-NEXT:    lbu s1, 26(a0)
 ; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t2, s3, t2
+; RV32I-NEXT:    or t3, s8, s6
+; RV32I-NEXT:    or t4, s10, s9
 ; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or a0, s11, s10
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or s0, s2, s1
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s9, 31(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s7, s4
+; RV32I-NEXT:    or s4, s11, s5
+; RV32I-NEXT:    or s5, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s10, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    mv s4, sp
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    addi t6, sp, 8
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s5, s6, s5
-; RV32I-NEXT:    or s1, s2, s1
-; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or a0, a0, t5
-; RV32I-NEXT:    or t0, s0, t6
-; RV32I-NEXT:    or t1, s5, s3
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s0, t1
+; RV32I-NEXT:    or t1, s2, s1
+; RV32I-NEXT:    or s0, s6, s3
+; RV32I-NEXT:    or s1, s9, s8
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s10
+; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or a7, a7, t5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw a7, 28(sp)
+; RV32I-NEXT:    sw t0, 32(sp)
+; RV32I-NEXT:    sw s0, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    add a1, s4, a1
+; RV32I-NEXT:    add a1, t6, a1
 ; RV32I-NEXT:    andi a0, t1, 24
-; RV32I-NEXT:    xori a7, a0, 31
+; RV32I-NEXT:    xori t0, a0, 31
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw t0, 16(a1)
+; RV32I-NEXT:    lw a7, 16(a1)
 ; RV32I-NEXT:    lw t2, 20(a1)
 ; RV32I-NEXT:    lw t3, 24(a1)
 ; RV32I-NEXT:    lw t4, 28(a1)
@@ -2341,33 +2344,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl a1, a3, t1
 ; RV32I-NEXT:    slli t6, a4, 1
 ; RV32I-NEXT:    srl a3, a6, t1
-; RV32I-NEXT:    slli s0, t0, 1
+; RV32I-NEXT:    slli s0, a7, 1
 ; RV32I-NEXT:    srl a4, a5, t1
 ; RV32I-NEXT:    slli s1, a6, 1
 ; RV32I-NEXT:    srl a5, t2, t1
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    srl a6, t0, t1
+; RV32I-NEXT:    srl a6, a7, t1
 ; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    srl t0, t3, t1
+; RV32I-NEXT:    srl a7, t3, t1
 ; RV32I-NEXT:    slli t3, t4, 1
 ; RV32I-NEXT:    srl t1, t4, t1
-; RV32I-NEXT:    sll t4, t5, a7
-; RV32I-NEXT:    sll t5, t6, a7
-; RV32I-NEXT:    sll t6, s0, a7
-; RV32I-NEXT:    sll s0, s1, a7
-; RV32I-NEXT:    sll s1, s2, a7
-; RV32I-NEXT:    sll t2, t2, a7
-; RV32I-NEXT:    sll t3, t3, a7
+; RV32I-NEXT:    sll t4, t5, t0
+; RV32I-NEXT:    sll t5, t6, t0
+; RV32I-NEXT:    sll t6, s0, t0
+; RV32I-NEXT:    sll s0, s1, t0
+; RV32I-NEXT:    sll s1, s2, t0
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    sll t3, t3, t0
 ; RV32I-NEXT:    srli s2, t1, 24
 ; RV32I-NEXT:    srli s3, t1, 16
 ; RV32I-NEXT:    srli s4, t1, 8
-; RV32I-NEXT:    or a7, a0, t4
+; RV32I-NEXT:    or t0, a0, t4
 ; RV32I-NEXT:    or t4, a1, t5
 ; RV32I-NEXT:    or t5, a3, t6
 ; RV32I-NEXT:    or s0, a4, s0
 ; RV32I-NEXT:    or s1, a5, s1
 ; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    or t3, t0, t3
+; RV32I-NEXT:    or t3, a7, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
 ; RV32I-NEXT:    sb s3, 30(a2)
@@ -2384,23 +2387,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s6, s0, 24
 ; RV32I-NEXT:    srli s7, s0, 16
 ; RV32I-NEXT:    srli s0, s0, 8
-; RV32I-NEXT:    sb t0, 24(a2)
-; RV32I-NEXT:    srli t0, t5, 24
-; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    srli s8, t5, 24
+; RV32I-NEXT:    srli s9, t5, 16
 ; RV32I-NEXT:    srli t5, t5, 8
+; RV32I-NEXT:    srli s10, t4, 24
+; RV32I-NEXT:    srli s11, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    srli t6, t4, 24
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    srli a7, t0, 24
 ; RV32I-NEXT:    sb a6, 16(a2)
-; RV32I-NEXT:    srli a6, a7, 24
 ; RV32I-NEXT:    sb t2, 17(a2)
 ; RV32I-NEXT:    sb s3, 18(a2)
 ; RV32I-NEXT:    sb s2, 19(a2)
-; RV32I-NEXT:    srli t2, a7, 16
-; RV32I-NEXT:    srli a7, a7, 8
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    srli t0, t0, 8
 ; RV32I-NEXT:    sb a5, 20(a2)
 ; RV32I-NEXT:    sb s1, 21(a2)
 ; RV32I-NEXT:    sb s5, 22(a2)
@@ -2411,29 +2414,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    sb a3, 12(a2)
 ; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t3, 14(a2)
-; RV32I-NEXT:    sb t0, 15(a2)
+; RV32I-NEXT:    sb s9, 14(a2)
+; RV32I-NEXT:    sb s8, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb s11, 2(a2)
+; RV32I-NEXT:    sb s10, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb a7, 5(a2)
-; RV32I-NEXT:    sb t2, 6(a2)
-; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    sb t0, 5(a2)
+; RV32I-NEXT:    sb a6, 6(a2)
+; RV32I-NEXT:    sb a7, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2678,128 +2682,132 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ;
 ; RV32I-LABEL: lshr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a7, 1(a0)
+; RV32I-NEXT:    lbu t0, 2(a0)
+; RV32I-NEXT:    lbu t1, 3(a0)
+; RV32I-NEXT:    lbu s2, 4(a0)
+; RV32I-NEXT:    lbu s4, 5(a0)
+; RV32I-NEXT:    lbu s5, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s3, 8(a0)
+; RV32I-NEXT:    lbu s9, 9(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    lbu ra, 12(a0)
+; RV32I-NEXT:    lbu a1, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu a4, 16(a0)
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 17(a0)
+; RV32I-NEXT:    lbu t2, 18(a0)
+; RV32I-NEXT:    lbu t3, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t5, 21(a0)
 ; RV32I-NEXT:    lbu s0, 22(a0)
 ; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s3, s2
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s2, 25(a0)
-; RV32I-NEXT:    lbu s3, 26(a0)
-; RV32I-NEXT:    lbu s4, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or t6, s11, s10
-; RV32I-NEXT:    lbu s5, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s7, 30(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s2
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    lbu s2, 24(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
+; RV32I-NEXT:    lbu s7, 26(a0)
+; RV32I-NEXT:    lbu s8, 27(a0)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or s3, s9, s3
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    or s5, a1, ra
+; RV32I-NEXT:    lbu s9, 28(a0)
+; RV32I-NEXT:    lbu a1, 29(a0)
+; RV32I-NEXT:    lbu s10, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    lbu a3, 0(a3)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 8
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    mv s1, sp
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or t3, s2, t3
-; RV32I-NEXT:    or s2, s4, s3
-; RV32I-NEXT:    or s3, s6, s5
-; RV32I-NEXT:    or a0, a0, s7
-; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t5, t4
-; RV32I-NEXT:    or t0, s0, t6
-; RV32I-NEXT:    or t1, s2, t3
-; RV32I-NEXT:    or a0, a0, s3
-; RV32I-NEXT:    add s1, s1, a1
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    slli a3, a3, 2
+; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a6, a6, s11
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or t3, s6, s2
+; RV32I-NEXT:    or t5, s8, s7
+; RV32I-NEXT:    or a1, a1, s9
+; RV32I-NEXT:    or a0, a0, s10
+; RV32I-NEXT:    andi a3, a3, 28
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s3
+; RV32I-NEXT:    or t1, t4, s5
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, t5, t3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    add t6, t6, a3
+; RV32I-NEXT:    sw a6, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    lw a6, 16(s1)
-; RV32I-NEXT:    lw a5, 20(s1)
-; RV32I-NEXT:    lw a7, 24(s1)
-; RV32I-NEXT:    lw a1, 0(s1)
-; RV32I-NEXT:    lw a0, 4(s1)
-; RV32I-NEXT:    lw a4, 8(s1)
-; RV32I-NEXT:    lw a3, 12(s1)
-; RV32I-NEXT:    lw t0, 28(s1)
+; RV32I-NEXT:    sw a7, 12(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    lw a6, 16(t6)
+; RV32I-NEXT:    lw a5, 20(t6)
+; RV32I-NEXT:    lw a7, 24(t6)
+; RV32I-NEXT:    lw a1, 0(t6)
+; RV32I-NEXT:    lw a0, 4(t6)
+; RV32I-NEXT:    lw a4, 8(t6)
+; RV32I-NEXT:    lw a3, 12(t6)
+; RV32I-NEXT:    lw t0, 28(t6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -2814,21 +2822,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
-; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a3, 8
+; RV32I-NEXT:    srli t1, a1, 16
 ; RV32I-NEXT:    sb t0, 28(a2)
-; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
-; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t4, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -2840,35 +2848,36 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb t1, 13(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t6, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -2894,111 +2903,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 5(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t2, 2(a0)
+; RV64I-NEXT:    lbu s3, 3(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s4, 9(a0)
+; RV64I-NEXT:    lbu s5, 10(a0)
+; RV64I-NEXT:    lbu s6, 11(a0)
+; RV64I-NEXT:    lbu s7, 12(a0)
+; RV64I-NEXT:    lbu s11, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t3, 15(a0)
+; RV64I-NEXT:    lbu a3, 16(a0)
+; RV64I-NEXT:    lbu a6, 17(a0)
+; RV64I-NEXT:    lbu t4, 18(a0)
+; RV64I-NEXT:    lbu t5, 19(a0)
+; RV64I-NEXT:    lbu a4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
-; RV64I-NEXT:    lbu t3, 24(a0)
-; RV64I-NEXT:    lbu t4, 25(a0)
-; RV64I-NEXT:    lbu t5, 26(a0)
-; RV64I-NEXT:    lbu t6, 27(a0)
-; RV64I-NEXT:    slli s5, s5, 8
-; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s0, s5, s4
-; RV64I-NEXT:    or s1, s7, s6
-; RV64I-NEXT:    or s2, s9, s8
-; RV64I-NEXT:    lbu s3, 28(a0)
-; RV64I-NEXT:    lbu s4, 29(a0)
-; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, s3, t2
+; RV64I-NEXT:    or t0, s8, t0
+; RV64I-NEXT:    or t2, s10, s9
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s8, 25(a0)
+; RV64I-NEXT:    lbu s9, 26(a0)
+; RV64I-NEXT:    lbu s10, 27(a0)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    or s2, s4, s2
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s5, s11, s7
+; RV64I-NEXT:    lbu s6, 28(a0)
+; RV64I-NEXT:    lbu s7, 29(a0)
+; RV64I-NEXT:    lbu s11, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    slli s11, s11, 24
-; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    mv s7, sp
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t1
+; RV64I-NEXT:    mv t3, sp
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    slli s11, s11, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or t3, t4, t3
-; RV64I-NEXT:    or t4, t6, t5
-; RV64I-NEXT:    or t5, s4, s3
-; RV64I-NEXT:    or a0, a0, s5
-; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a4, t6, a4
 ; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or a7, s6, s2
-; RV64I-NEXT:    or t0, t4, t3
-; RV64I-NEXT:    or a0, a0, t5
-; RV64I-NEXT:    add s7, s7, a1
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or t4, s8, s3
+; RV64I-NEXT:    or t5, s10, s9
+; RV64I-NEXT:    or t6, s7, s6
+; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    or t0, s4, s2
+; RV64I-NEXT:    or t1, t1, s5
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, s0, a4
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a0, a0, t6
+; RV64I-NEXT:    add t3, t3, a1
 ; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a5, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a1, a6, a5
-; RV64I-NEXT:    or a4, a7, s0
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    sd a3, 0(sp)
-; RV64I-NEXT:    sd a1, 8(sp)
-; RV64I-NEXT:    sd a4, 16(sp)
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    sd a1, 0(sp)
+; RV64I-NEXT:    sd a5, 8(sp)
+; RV64I-NEXT:    sd a3, 16(sp)
 ; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    ld a4, 16(s7)
-; RV64I-NEXT:    ld a0, 8(s7)
-; RV64I-NEXT:    ld a1, 0(s7)
-; RV64I-NEXT:    ld a3, 24(s7)
+; RV64I-NEXT:    ld a4, 16(t3)
+; RV64I-NEXT:    ld a0, 8(t3)
+; RV64I-NEXT:    ld a1, 0(t3)
+; RV64I-NEXT:    ld a3, 24(t3)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -3017,25 +3026,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
+; RV64I-NEXT:    srli s8, a1, 24
+; RV64I-NEXT:    srli s9, a1, 16
+; RV64I-NEXT:    srli s10, a1, 8
+; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
-; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
-; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
 ; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli t1, a0, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli t2, a0, 32
+; RV64I-NEXT:    srli a6, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -3045,19 +3054,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli t3, a0, 16
+; RV64I-NEXT:    srli a7, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a6, 1(a2)
-; RV64I-NEXT:    sb a7, 2(a2)
-; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb s10, 1(a2)
+; RV64I-NEXT:    sb s9, 2(a2)
+; RV64I-NEXT:    sb s8, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb t2, 12(a2)
-; RV64I-NEXT:    sb t1, 13(a2)
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    sb a5, 15(a2)
+; RV64I-NEXT:    sb a6, 12(a2)
+; RV64I-NEXT:    sb a4, 13(a2)
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb s11, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb t3, 10(a2)
+; RV64I-NEXT:    sb a7, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -3076,128 +3085,132 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; RV32I-LABEL: lshr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a7, 1(a0)
+; RV32I-NEXT:    lbu t0, 2(a0)
+; RV32I-NEXT:    lbu t1, 3(a0)
+; RV32I-NEXT:    lbu s2, 4(a0)
+; RV32I-NEXT:    lbu s4, 5(a0)
+; RV32I-NEXT:    lbu s5, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s3, 8(a0)
+; RV32I-NEXT:    lbu s9, 9(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    lbu ra, 12(a0)
+; RV32I-NEXT:    lbu a1, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu a4, 16(a0)
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 17(a0)
+; RV32I-NEXT:    lbu t2, 18(a0)
+; RV32I-NEXT:    lbu t3, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t5, 21(a0)
 ; RV32I-NEXT:    lbu s0, 22(a0)
 ; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s3, s2
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s2, 25(a0)
-; RV32I-NEXT:    lbu s3, 26(a0)
-; RV32I-NEXT:    lbu s4, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or t6, s11, s10
-; RV32I-NEXT:    lbu s5, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s7, 30(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s2
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    lbu s2, 24(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
+; RV32I-NEXT:    lbu s7, 26(a0)
+; RV32I-NEXT:    lbu s8, 27(a0)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or s3, s9, s3
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    or s5, a1, ra
+; RV32I-NEXT:    lbu s9, 28(a0)
+; RV32I-NEXT:    lbu a1, 29(a0)
+; RV32I-NEXT:    lbu s10, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    lbu a3, 0(a3)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 8
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    mv s1, sp
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or t3, s2, t3
-; RV32I-NEXT:    or s2, s4, s3
-; RV32I-NEXT:    or s3, s6, s5
-; RV32I-NEXT:    or a0, a0, s7
-; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t5, t4
-; RV32I-NEXT:    or t0, s0, t6
-; RV32I-NEXT:    or t1, s2, t3
-; RV32I-NEXT:    or a0, a0, s3
-; RV32I-NEXT:    add s1, s1, a1
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    slli a3, a3, 3
+; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a6, a6, s11
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or t3, s6, s2
+; RV32I-NEXT:    or t5, s8, s7
+; RV32I-NEXT:    or a1, a1, s9
+; RV32I-NEXT:    or a0, a0, s10
+; RV32I-NEXT:    andi a3, a3, 24
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s3
+; RV32I-NEXT:    or t1, t4, s5
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, t5, t3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    add t6, t6, a3
+; RV32I-NEXT:    sw a6, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    lw a6, 16(s1)
-; RV32I-NEXT:    lw a5, 20(s1)
-; RV32I-NEXT:    lw a7, 24(s1)
-; RV32I-NEXT:    lw a1, 0(s1)
-; RV32I-NEXT:    lw a0, 4(s1)
-; RV32I-NEXT:    lw a4, 8(s1)
-; RV32I-NEXT:    lw a3, 12(s1)
-; RV32I-NEXT:    lw t0, 28(s1)
+; RV32I-NEXT:    sw a7, 12(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    lw a6, 16(t6)
+; RV32I-NEXT:    lw a5, 20(t6)
+; RV32I-NEXT:    lw a7, 24(t6)
+; RV32I-NEXT:    lw a1, 0(t6)
+; RV32I-NEXT:    lw a0, 4(t6)
+; RV32I-NEXT:    lw a4, 8(t6)
+; RV32I-NEXT:    lw a3, 12(t6)
+; RV32I-NEXT:    lw t0, 28(t6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -3212,21 +3225,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
-; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a3, 8
+; RV32I-NEXT:    srli t1, a1, 16
 ; RV32I-NEXT:    sb t0, 28(a2)
-; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
-; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t4, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -3238,35 +3251,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb t1, 13(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t6, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -3510,129 +3524,132 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t3, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu s0, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s6, 10(a0)
+; RV32I-NEXT:    lbu s8, 11(a0)
+; RV32I-NEXT:    lbu s9, 12(a0)
+; RV32I-NEXT:    lbu s10, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s7, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu ra, 18(a0)
+; RV32I-NEXT:    lbu a3, 19(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu t0, 23(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or a4, a4, s1
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    or a5, t3, t1
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu t1, 24(a0)
 ; RV32I-NEXT:    lbu s0, 25(a0)
 ; RV32I-NEXT:    lbu s1, 26(a0)
 ; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t2, s3, t2
+; RV32I-NEXT:    or t3, s8, s6
+; RV32I-NEXT:    or t4, s10, s9
 ; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or a0, s11, s10
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or s0, s2, s1
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s9, 31(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s7, s4
+; RV32I-NEXT:    or s4, s11, s5
+; RV32I-NEXT:    or s5, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s10, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    addi s4, sp, 32
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    addi t6, sp, 40
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s5, s6, s5
-; RV32I-NEXT:    or s1, s2, s1
-; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or a0, a0, t5
-; RV32I-NEXT:    or t0, s0, t6
-; RV32I-NEXT:    or t1, s5, s3
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s0, t1
+; RV32I-NEXT:    or t1, s2, s1
+; RV32I-NEXT:    or s0, s6, s3
+; RV32I-NEXT:    or s1, s9, s8
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s10
+; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or a7, a7, t5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw t2, 56(sp)
+; RV32I-NEXT:    sw a7, 60(sp)
+; RV32I-NEXT:    sw t0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
 ; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    sw t0, 56(sp)
-; RV32I-NEXT:    sw t1, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
-; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
 ; RV32I-NEXT:    slli a3, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    sub a1, s4, a1
+; RV32I-NEXT:    sub a1, t6, a1
 ; RV32I-NEXT:    andi a0, a3, 24
 ; RV32I-NEXT:    xori a0, a0, 31
 ; RV32I-NEXT:    lw a4, 0(a1)
@@ -3647,10 +3664,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli t4, a4, 1
 ; RV32I-NEXT:    sll t5, a7, a3
 ; RV32I-NEXT:    srli t6, a6, 1
-; RV32I-NEXT:    sll a6, a6, a3
+; RV32I-NEXT:    sll s0, a6, a3
 ; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    sll s0, t1, a3
-; RV32I-NEXT:    srli s1, t0, 1
+; RV32I-NEXT:    sll s1, t1, a3
+; RV32I-NEXT:    srli a6, t0, 1
 ; RV32I-NEXT:    sll s2, t0, a3
 ; RV32I-NEXT:    srli a7, a7, 1
 ; RV32I-NEXT:    sll s3, a1, a3
@@ -3658,56 +3675,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll s4, t2, a3
 ; RV32I-NEXT:    srli t0, t1, 1
 ; RV32I-NEXT:    sll s5, a4, a3
-; RV32I-NEXT:    srl t4, t4, a0
-; RV32I-NEXT:    srl a4, t6, a0
-; RV32I-NEXT:    srl t1, a5, a0
-; RV32I-NEXT:    srl t6, s1, a0
-; RV32I-NEXT:    srl s1, a7, a0
-; RV32I-NEXT:    srl s6, a1, a0
-; RV32I-NEXT:    srl s7, t0, a0
-; RV32I-NEXT:    srli t2, s4, 24
-; RV32I-NEXT:    srli t0, s3, 24
+; RV32I-NEXT:    srl t2, t4, a0
+; RV32I-NEXT:    srl t4, t6, a0
+; RV32I-NEXT:    srl t6, a5, a0
+; RV32I-NEXT:    srl s6, a6, a0
+; RV32I-NEXT:    srl s7, a7, a0
+; RV32I-NEXT:    srl s8, a1, a0
+; RV32I-NEXT:    srl s9, t0, a0
+; RV32I-NEXT:    srli t1, s4, 24
+; RV32I-NEXT:    srli a7, s3, 24
 ; RV32I-NEXT:    srli a5, s2, 24
-; RV32I-NEXT:    srli a3, s0, 24
-; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    srli a3, s1, 24
+; RV32I-NEXT:    srli a1, s0, 24
 ; RV32I-NEXT:    srli a0, t5, 24
-; RV32I-NEXT:    srli s8, s5, 24
-; RV32I-NEXT:    or a4, t5, a4
-; RV32I-NEXT:    srli t5, s5, 16
-; RV32I-NEXT:    or t1, a6, t1
-; RV32I-NEXT:    srli s9, s5, 8
-; RV32I-NEXT:    or a7, t3, t4
-; RV32I-NEXT:    srli a6, t3, 24
-; RV32I-NEXT:    or t3, s0, t6
-; RV32I-NEXT:    or t4, s2, s1
-; RV32I-NEXT:    or t6, s3, s6
-; RV32I-NEXT:    or s0, s4, s7
+; RV32I-NEXT:    srli s10, s5, 24
+; RV32I-NEXT:    srli s11, s5, 16
+; RV32I-NEXT:    srli ra, s5, 8
+; RV32I-NEXT:    srli a4, t3, 24
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or t0, t5, t4
+; RV32I-NEXT:    or t2, s0, t6
+; RV32I-NEXT:    or t3, s1, s6
+; RV32I-NEXT:    or t4, s2, s7
+; RV32I-NEXT:    or t5, s3, s8
+; RV32I-NEXT:    or t6, s4, s9
 ; RV32I-NEXT:    sb s5, 0(a2)
-; RV32I-NEXT:    sb s9, 1(a2)
-; RV32I-NEXT:    sb t5, 2(a2)
-; RV32I-NEXT:    sb s8, 3(a2)
-; RV32I-NEXT:    srli t5, s0, 16
-; RV32I-NEXT:    srli s1, s0, 8
-; RV32I-NEXT:    srli s2, t6, 16
-; RV32I-NEXT:    srli s3, t6, 8
+; RV32I-NEXT:    sb ra, 1(a2)
+; RV32I-NEXT:    sb s11, 2(a2)
+; RV32I-NEXT:    sb s10, 3(a2)
+; RV32I-NEXT:    srli s0, t6, 16
+; RV32I-NEXT:    srli s1, t6, 8
+; RV32I-NEXT:    srli s2, t5, 16
+; RV32I-NEXT:    srli s3, t5, 8
 ; RV32I-NEXT:    srli s4, t4, 16
 ; RV32I-NEXT:    srli s5, t4, 8
 ; RV32I-NEXT:    srli s6, t3, 16
 ; RV32I-NEXT:    srli s7, t3, 8
-; RV32I-NEXT:    sb s0, 24(a2)
-; RV32I-NEXT:    srli s0, t1, 16
+; RV32I-NEXT:    srli s8, t2, 16
+; RV32I-NEXT:    srli s9, t2, 8
+; RV32I-NEXT:    srli s10, t0, 16
+; RV32I-NEXT:    srli s11, t0, 8
+; RV32I-NEXT:    sb t6, 24(a2)
 ; RV32I-NEXT:    sb s1, 25(a2)
-; RV32I-NEXT:    srli s1, t1, 8
-; RV32I-NEXT:    sb t5, 26(a2)
-; RV32I-NEXT:    srli t5, a4, 16
-; RV32I-NEXT:    sb t2, 27(a2)
-; RV32I-NEXT:    srli t2, a4, 8
-; RV32I-NEXT:    sb t6, 28(a2)
-; RV32I-NEXT:    srli t6, a7, 16
+; RV32I-NEXT:    sb s0, 26(a2)
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t5, 28(a2)
 ; RV32I-NEXT:    sb s3, 29(a2)
 ; RV32I-NEXT:    sb s2, 30(a2)
-; RV32I-NEXT:    sb t0, 31(a2)
-; RV32I-NEXT:    srli t0, a7, 8
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a6, 8
 ; RV32I-NEXT:    sb t4, 16(a2)
 ; RV32I-NEXT:    sb s5, 17(a2)
 ; RV32I-NEXT:    sb s4, 18(a2)
@@ -3716,31 +3733,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s7, 21(a2)
 ; RV32I-NEXT:    sb s6, 22(a2)
 ; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    sb t1, 8(a2)
-; RV32I-NEXT:    sb s1, 9(a2)
-; RV32I-NEXT:    sb s0, 10(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb s9, 9(a2)
+; RV32I-NEXT:    sb s8, 10(a2)
 ; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    sb a4, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
+; RV32I-NEXT:    sb t0, 12(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
 ; RV32I-NEXT:    sb a0, 15(a2)
-; RV32I-NEXT:    sb a7, 4(a2)
-; RV32I-NEXT:    sb t0, 5(a2)
-; RV32I-NEXT:    sb t6, 6(a2)
-; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb t1, 6(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -3985,128 +4003,132 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ;
 ; RV32I-LABEL: shl_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a7, 1(a0)
+; RV32I-NEXT:    lbu t0, 2(a0)
+; RV32I-NEXT:    lbu t1, 3(a0)
+; RV32I-NEXT:    lbu s2, 4(a0)
+; RV32I-NEXT:    lbu s4, 5(a0)
+; RV32I-NEXT:    lbu s5, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s3, 8(a0)
+; RV32I-NEXT:    lbu s9, 9(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    lbu ra, 12(a0)
+; RV32I-NEXT:    lbu a1, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu a4, 16(a0)
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 17(a0)
+; RV32I-NEXT:    lbu t2, 18(a0)
+; RV32I-NEXT:    lbu t3, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t5, 21(a0)
 ; RV32I-NEXT:    lbu s0, 22(a0)
 ; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s3, s2
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s2, 25(a0)
-; RV32I-NEXT:    lbu s3, 26(a0)
-; RV32I-NEXT:    lbu s4, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or t6, s11, s10
-; RV32I-NEXT:    lbu s5, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s7, 30(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s2
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    lbu s2, 24(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
+; RV32I-NEXT:    lbu s7, 26(a0)
+; RV32I-NEXT:    lbu s8, 27(a0)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or s3, s9, s3
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    or s5, a1, ra
+; RV32I-NEXT:    lbu s9, 28(a0)
+; RV32I-NEXT:    lbu a1, 29(a0)
+; RV32I-NEXT:    lbu s10, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    lbu a3, 0(a3)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 40
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    addi s1, sp, 32
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or t3, s2, t3
-; RV32I-NEXT:    or s2, s4, s3
-; RV32I-NEXT:    or s3, s6, s5
-; RV32I-NEXT:    or a0, a0, s7
-; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t5, t4
-; RV32I-NEXT:    or t0, s0, t6
-; RV32I-NEXT:    or t1, s2, t3
-; RV32I-NEXT:    or a0, a0, s3
-; RV32I-NEXT:    sub s1, s1, a1
-; RV32I-NEXT:    sw a7, 48(sp)
-; RV32I-NEXT:    sw t0, 52(sp)
-; RV32I-NEXT:    sw t1, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    slli a3, a3, 2
+; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a6, a6, s11
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or t3, s6, s2
+; RV32I-NEXT:    or t5, s8, s7
+; RV32I-NEXT:    or a1, a1, s9
+; RV32I-NEXT:    or a0, a0, s10
+; RV32I-NEXT:    andi a3, a3, 28
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s3
+; RV32I-NEXT:    or t1, t4, s5
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, t5, t3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    sub t3, t6, a3
+; RV32I-NEXT:    sw a6, 56(sp)
+; RV32I-NEXT:    sw a4, 60(sp)
+; RV32I-NEXT:    sw t2, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
 ; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
-; RV32I-NEXT:    lw a6, 16(s1)
-; RV32I-NEXT:    lw a5, 20(s1)
-; RV32I-NEXT:    lw a7, 24(s1)
-; RV32I-NEXT:    lw a1, 0(s1)
-; RV32I-NEXT:    lw a0, 4(s1)
-; RV32I-NEXT:    lw a4, 8(s1)
-; RV32I-NEXT:    lw a3, 12(s1)
-; RV32I-NEXT:    lw t0, 28(s1)
+; RV32I-NEXT:    sw a7, 44(sp)
+; RV32I-NEXT:    sw t0, 48(sp)
+; RV32I-NEXT:    sw t1, 52(sp)
+; RV32I-NEXT:    lw a6, 16(t3)
+; RV32I-NEXT:    lw a5, 20(t3)
+; RV32I-NEXT:    lw a7, 24(t3)
+; RV32I-NEXT:    lw a1, 0(t3)
+; RV32I-NEXT:    lw a0, 4(t3)
+; RV32I-NEXT:    lw a4, 8(t3)
+; RV32I-NEXT:    lw a3, 12(t3)
+; RV32I-NEXT:    lw t0, 28(t3)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -4121,21 +4143,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
-; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a3, 8
+; RV32I-NEXT:    srli t1, a1, 16
 ; RV32I-NEXT:    sb t0, 28(a2)
-; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
-; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t4, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -4147,35 +4169,36 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb t1, 13(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t6, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -4201,111 +4224,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 5(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t2, 2(a0)
+; RV64I-NEXT:    lbu s3, 3(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s4, 9(a0)
+; RV64I-NEXT:    lbu s5, 10(a0)
+; RV64I-NEXT:    lbu s6, 11(a0)
+; RV64I-NEXT:    lbu s7, 12(a0)
+; RV64I-NEXT:    lbu s11, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t3, 15(a0)
+; RV64I-NEXT:    lbu a3, 16(a0)
+; RV64I-NEXT:    lbu a6, 17(a0)
+; RV64I-NEXT:    lbu t4, 18(a0)
+; RV64I-NEXT:    lbu t5, 19(a0)
+; RV64I-NEXT:    lbu a4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
-; RV64I-NEXT:    lbu t3, 24(a0)
-; RV64I-NEXT:    lbu t4, 25(a0)
-; RV64I-NEXT:    lbu t5, 26(a0)
-; RV64I-NEXT:    lbu t6, 27(a0)
-; RV64I-NEXT:    slli s5, s5, 8
-; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s0, s5, s4
-; RV64I-NEXT:    or s1, s7, s6
-; RV64I-NEXT:    or s2, s9, s8
-; RV64I-NEXT:    lbu s3, 28(a0)
-; RV64I-NEXT:    lbu s4, 29(a0)
-; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, s3, t2
+; RV64I-NEXT:    or t0, s8, t0
+; RV64I-NEXT:    or t2, s10, s9
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s8, 25(a0)
+; RV64I-NEXT:    lbu s9, 26(a0)
+; RV64I-NEXT:    lbu s10, 27(a0)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    or s2, s4, s2
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s5, s11, s7
+; RV64I-NEXT:    lbu s6, 28(a0)
+; RV64I-NEXT:    lbu s7, 29(a0)
+; RV64I-NEXT:    lbu s11, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    slli s11, s11, 24
-; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    addi s7, sp, 32
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t1
+; RV64I-NEXT:    addi t3, sp, 32
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    slli s11, s11, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or t3, t4, t3
-; RV64I-NEXT:    or t4, t6, t5
-; RV64I-NEXT:    or t5, s4, s3
-; RV64I-NEXT:    or a0, a0, s5
-; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a4, t6, a4
 ; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or a7, s6, s2
-; RV64I-NEXT:    or t0, t4, t3
-; RV64I-NEXT:    or a0, a0, t5
-; RV64I-NEXT:    sub t1, s7, a1
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or t4, s8, s3
+; RV64I-NEXT:    or t5, s10, s9
+; RV64I-NEXT:    or t6, s7, s6
+; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    or t0, s4, s2
+; RV64I-NEXT:    or t1, t1, s5
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, s0, a4
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a0, a0, t6
+; RV64I-NEXT:    sub t2, t3, a1
 ; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a5, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a1, a6, a5
-; RV64I-NEXT:    or a4, a7, s0
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    sd a3, 32(sp)
-; RV64I-NEXT:    sd a1, 40(sp)
-; RV64I-NEXT:    sd a4, 48(sp)
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    sd a1, 32(sp)
+; RV64I-NEXT:    sd a5, 40(sp)
+; RV64I-NEXT:    sd a3, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    ld a4, 16(t1)
-; RV64I-NEXT:    ld a0, 8(t1)
-; RV64I-NEXT:    ld a1, 0(t1)
-; RV64I-NEXT:    ld a3, 24(t1)
+; RV64I-NEXT:    ld a4, 16(t2)
+; RV64I-NEXT:    ld a0, 8(t2)
+; RV64I-NEXT:    ld a1, 0(t2)
+; RV64I-NEXT:    ld a3, 24(t2)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -4324,25 +4347,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
+; RV64I-NEXT:    srli s8, a1, 24
+; RV64I-NEXT:    srli s9, a1, 16
+; RV64I-NEXT:    srli s10, a1, 8
+; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
-; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
-; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
 ; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli t1, a0, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli t2, a0, 32
+; RV64I-NEXT:    srli a6, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -4352,19 +4375,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli t3, a0, 16
+; RV64I-NEXT:    srli a7, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a6, 1(a2)
-; RV64I-NEXT:    sb a7, 2(a2)
-; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb s10, 1(a2)
+; RV64I-NEXT:    sb s9, 2(a2)
+; RV64I-NEXT:    sb s8, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb t2, 12(a2)
-; RV64I-NEXT:    sb t1, 13(a2)
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    sb a5, 15(a2)
+; RV64I-NEXT:    sb a6, 12(a2)
+; RV64I-NEXT:    sb a4, 13(a2)
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb s11, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb t3, 10(a2)
+; RV64I-NEXT:    sb a7, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -4383,128 +4406,132 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ;
 ; RV32I-LABEL: shl_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a7, 1(a0)
+; RV32I-NEXT:    lbu t0, 2(a0)
+; RV32I-NEXT:    lbu t1, 3(a0)
+; RV32I-NEXT:    lbu s2, 4(a0)
+; RV32I-NEXT:    lbu s4, 5(a0)
+; RV32I-NEXT:    lbu s5, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s3, 8(a0)
+; RV32I-NEXT:    lbu s9, 9(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    lbu ra, 12(a0)
+; RV32I-NEXT:    lbu a1, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu a4, 16(a0)
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 17(a0)
+; RV32I-NEXT:    lbu t2, 18(a0)
+; RV32I-NEXT:    lbu t3, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t5, 21(a0)
 ; RV32I-NEXT:    lbu s0, 22(a0)
 ; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s3, s2
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s2, 25(a0)
-; RV32I-NEXT:    lbu s3, 26(a0)
-; RV32I-NEXT:    lbu s4, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or t6, s11, s10
-; RV32I-NEXT:    lbu s5, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s7, 30(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s2
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    lbu s2, 24(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
+; RV32I-NEXT:    lbu s7, 26(a0)
+; RV32I-NEXT:    lbu s8, 27(a0)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or s3, s9, s3
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    or s5, a1, ra
+; RV32I-NEXT:    lbu s9, 28(a0)
+; RV32I-NEXT:    lbu a1, 29(a0)
+; RV32I-NEXT:    lbu s10, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    lbu a3, 0(a3)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 40
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    addi s1, sp, 32
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or t3, s2, t3
-; RV32I-NEXT:    or s2, s4, s3
-; RV32I-NEXT:    or s3, s6, s5
-; RV32I-NEXT:    or a0, a0, s7
-; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t5, t4
-; RV32I-NEXT:    or t0, s0, t6
-; RV32I-NEXT:    or t1, s2, t3
-; RV32I-NEXT:    or a0, a0, s3
-; RV32I-NEXT:    sub s1, s1, a1
-; RV32I-NEXT:    sw a7, 48(sp)
-; RV32I-NEXT:    sw t0, 52(sp)
-; RV32I-NEXT:    sw t1, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    slli a3, a3, 3
+; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a6, a6, s11
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or t3, s6, s2
+; RV32I-NEXT:    or t5, s8, s7
+; RV32I-NEXT:    or a1, a1, s9
+; RV32I-NEXT:    or a0, a0, s10
+; RV32I-NEXT:    andi a3, a3, 24
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or t0, s4, s3
+; RV32I-NEXT:    or t1, t4, s5
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, t5, t3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    sub t3, t6, a3
+; RV32I-NEXT:    sw a6, 56(sp)
+; RV32I-NEXT:    sw a4, 60(sp)
+; RV32I-NEXT:    sw t2, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
 ; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
-; RV32I-NEXT:    lw a6, 16(s1)
-; RV32I-NEXT:    lw a5, 20(s1)
-; RV32I-NEXT:    lw a7, 24(s1)
-; RV32I-NEXT:    lw a1, 0(s1)
-; RV32I-NEXT:    lw a0, 4(s1)
-; RV32I-NEXT:    lw a4, 8(s1)
-; RV32I-NEXT:    lw a3, 12(s1)
-; RV32I-NEXT:    lw t0, 28(s1)
+; RV32I-NEXT:    sw a7, 44(sp)
+; RV32I-NEXT:    sw t0, 48(sp)
+; RV32I-NEXT:    sw t1, 52(sp)
+; RV32I-NEXT:    lw a6, 16(t3)
+; RV32I-NEXT:    lw a5, 20(t3)
+; RV32I-NEXT:    lw a7, 24(t3)
+; RV32I-NEXT:    lw a1, 0(t3)
+; RV32I-NEXT:    lw a0, 4(t3)
+; RV32I-NEXT:    lw a4, 8(t3)
+; RV32I-NEXT:    lw a3, 12(t3)
+; RV32I-NEXT:    lw t0, 28(t3)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -4519,21 +4546,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
-; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a3, 8
+; RV32I-NEXT:    srli t1, a1, 16
 ; RV32I-NEXT:    sb t0, 28(a2)
-; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
-; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t4, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -4545,35 +4572,36 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb t1, 13(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t6, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -4818,137 +4846,140 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t6, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t3, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu t5, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu s1, 9(a0)
+; RV32I-NEXT:    lbu s7, 10(a0)
+; RV32I-NEXT:    lbu s8, 11(a0)
+; RV32I-NEXT:    lbu s9, 12(a0)
+; RV32I-NEXT:    lbu s10, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s6, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu ra, 18(a0)
+; RV32I-NEXT:    lbu a3, 19(a0)
+; RV32I-NEXT:    lbu s2, 20(a0)
+; RV32I-NEXT:    lbu s3, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu t0, 23(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    or a4, a4, t6
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t3, t1
+; RV32I-NEXT:    or a6, t5, t4
+; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    lbu t5, 25(a0)
+; RV32I-NEXT:    lbu t6, 26(a0)
+; RV32I-NEXT:    lbu s0, 27(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t6, 24(a0)
-; RV32I-NEXT:    lbu s0, 25(a0)
-; RV32I-NEXT:    lbu s1, 26(a0)
-; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or s6, s11, s10
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or s0, s2, s1
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    mv s4, sp
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli s2, s2, 8
 ; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t2, s1, t2
+; RV32I-NEXT:    or t3, s8, s7
+; RV32I-NEXT:    or t4, s10, s9
+; RV32I-NEXT:    lbu s1, 28(a0)
+; RV32I-NEXT:    lbu s7, 29(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s9, 31(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s6, s4
+; RV32I-NEXT:    or s4, s11, s5
+; RV32I-NEXT:    or s5, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s6, 1(a1)
+; RV32I-NEXT:    lbu s10, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s2, s3, s2
+; RV32I-NEXT:    addi s3, sp, 8
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s5, a0, s5
-; RV32I-NEXT:    or s1, s2, s1
-; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, s6, t5
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t5, t1
 ; RV32I-NEXT:    or t1, s0, t6
-; RV32I-NEXT:    or t2, s5, s3
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    sw a0, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
-; RV32I-NEXT:    sw a0, 40(sp)
-; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    or t5, s7, s1
+; RV32I-NEXT:    or t6, s9, s8
+; RV32I-NEXT:    or a3, s6, a3
+; RV32I-NEXT:    or a1, a1, s10
+; RV32I-NEXT:    srai s0, s9, 31
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or a7, a7, s2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw s0, 56(sp)
+; RV32I-NEXT:    sw s0, 60(sp)
+; RV32I-NEXT:    sw s0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw s0, 40(sp)
+; RV32I-NEXT:    sw s0, 44(sp)
+; RV32I-NEXT:    sw s0, 48(sp)
+; RV32I-NEXT:    sw s0, 52(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw a7, 28(sp)
+; RV32I-NEXT:    sw t0, 32(sp)
+; RV32I-NEXT:    sw t1, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    add a1, s4, a1
+; RV32I-NEXT:    add a1, s3, a1
 ; RV32I-NEXT:    andi a0, t1, 24
-; RV32I-NEXT:    xori a7, a0, 31
+; RV32I-NEXT:    xori t0, a0, 31
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw t0, 16(a1)
+; RV32I-NEXT:    lw a7, 16(a1)
 ; RV32I-NEXT:    lw t2, 20(a1)
 ; RV32I-NEXT:    lw t3, 24(a1)
 ; RV32I-NEXT:    lw t4, 28(a1)
@@ -4957,33 +4988,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl a1, a3, t1
 ; RV32I-NEXT:    slli t6, a4, 1
 ; RV32I-NEXT:    srl a3, a6, t1
-; RV32I-NEXT:    slli s0, t0, 1
+; RV32I-NEXT:    slli s0, a7, 1
 ; RV32I-NEXT:    srl a4, a5, t1
 ; RV32I-NEXT:    slli s1, a6, 1
 ; RV32I-NEXT:    srl a5, t2, t1
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    srl a6, t0, t1
+; RV32I-NEXT:    srl a6, a7, t1
 ; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    srl t0, t3, t1
+; RV32I-NEXT:    srl a7, t3, t1
 ; RV32I-NEXT:    slli t3, t4, 1
 ; RV32I-NEXT:    sra t1, t4, t1
-; RV32I-NEXT:    sll t4, t5, a7
-; RV32I-NEXT:    sll t5, t6, a7
-; RV32I-NEXT:    sll t6, s0, a7
-; RV32I-NEXT:    sll s0, s1, a7
-; RV32I-NEXT:    sll s1, s2, a7
-; RV32I-NEXT:    sll t2, t2, a7
-; RV32I-NEXT:    sll t3, t3, a7
+; RV32I-NEXT:    sll t4, t5, t0
+; RV32I-NEXT:    sll t5, t6, t0
+; RV32I-NEXT:    sll t6, s0, t0
+; RV32I-NEXT:    sll s0, s1, t0
+; RV32I-NEXT:    sll s1, s2, t0
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    sll t3, t3, t0
 ; RV32I-NEXT:    srli s2, t1, 24
 ; RV32I-NEXT:    srli s3, t1, 16
 ; RV32I-NEXT:    srli s4, t1, 8
-; RV32I-NEXT:    or a7, a0, t4
+; RV32I-NEXT:    or t0, a0, t4
 ; RV32I-NEXT:    or t4, a1, t5
 ; RV32I-NEXT:    or t5, a3, t6
 ; RV32I-NEXT:    or s0, a4, s0
 ; RV32I-NEXT:    or s1, a5, s1
 ; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    or t3, t0, t3
+; RV32I-NEXT:    or t3, a7, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
 ; RV32I-NEXT:    sb s3, 30(a2)
@@ -5000,23 +5031,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s6, s0, 24
 ; RV32I-NEXT:    srli s7, s0, 16
 ; RV32I-NEXT:    srli s0, s0, 8
-; RV32I-NEXT:    sb t0, 24(a2)
-; RV32I-NEXT:    srli t0, t5, 24
-; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    srli s8, t5, 24
+; RV32I-NEXT:    srli s9, t5, 16
 ; RV32I-NEXT:    srli t5, t5, 8
+; RV32I-NEXT:    srli s10, t4, 24
+; RV32I-NEXT:    srli s11, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    srli t6, t4, 24
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    srli a7, t0, 24
 ; RV32I-NEXT:    sb a6, 16(a2)
-; RV32I-NEXT:    srli a6, a7, 24
 ; RV32I-NEXT:    sb t2, 17(a2)
 ; RV32I-NEXT:    sb s3, 18(a2)
 ; RV32I-NEXT:    sb s2, 19(a2)
-; RV32I-NEXT:    srli t2, a7, 16
-; RV32I-NEXT:    srli a7, a7, 8
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    srli t0, t0, 8
 ; RV32I-NEXT:    sb a5, 20(a2)
 ; RV32I-NEXT:    sb s1, 21(a2)
 ; RV32I-NEXT:    sb s5, 22(a2)
@@ -5027,29 +5058,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    sb a3, 12(a2)
 ; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t3, 14(a2)
-; RV32I-NEXT:    sb t0, 15(a2)
+; RV32I-NEXT:    sb s9, 14(a2)
+; RV32I-NEXT:    sb s8, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb s11, 2(a2)
+; RV32I-NEXT:    sb s10, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb a7, 5(a2)
-; RV32I-NEXT:    sb t2, 6(a2)
-; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    sb t0, 5(a2)
+; RV32I-NEXT:    sb a6, 6(a2)
+; RV32I-NEXT:    sb a7, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -5295,129 +5327,130 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ;
 ; RV32I-LABEL: ashr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t1, 3(a0)
+; RV32I-NEXT:    lbu s0, 4(a0)
+; RV32I-NEXT:    lbu s2, 5(a0)
+; RV32I-NEXT:    lbu s3, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s1, 8(a0)
+; RV32I-NEXT:    lbu s7, 9(a0)
+; RV32I-NEXT:    lbu s8, 10(a0)
+; RV32I-NEXT:    lbu s9, 11(a0)
+; RV32I-NEXT:    lbu s10, 12(a0)
+; RV32I-NEXT:    lbu s11, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t0, 17(a0)
+; RV32I-NEXT:    lbu t2, 18(a0)
+; RV32I-NEXT:    lbu t3, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 22(a0)
+; RV32I-NEXT:    lbu t6, 23(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t1, a7
+; RV32I-NEXT:    or a7, s2, s0
+; RV32I-NEXT:    or t1, s6, s3
+; RV32I-NEXT:    lbu s0, 24(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
+; RV32I-NEXT:    lbu ra, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or s1, s7, s1
+; RV32I-NEXT:    or s7, s9, s8
+; RV32I-NEXT:    or s3, s11, s10
+; RV32I-NEXT:    lbu s8, 28(a0)
+; RV32I-NEXT:    lbu s9, 29(a0)
+; RV32I-NEXT:    lbu s10, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or s4, s5, s4
+; RV32I-NEXT:    addi s5, sp, 8
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t5, 25(a0)
-; RV32I-NEXT:    lbu t6, 26(a0)
-; RV32I-NEXT:    lbu s0, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli s2, s2, 24
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t4, s5, s4
-; RV32I-NEXT:    or s1, s7, s6
-; RV32I-NEXT:    or s2, s9, s8
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or s6, s11, s10
-; RV32I-NEXT:    mv s7, sp
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or t3, t5, t3
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or t6, s4, s3
-; RV32I-NEXT:    or s0, a0, s5
+; RV32I-NEXT:    or a3, t0, a3
+; RV32I-NEXT:    or t0, t3, t2
+; RV32I-NEXT:    or a4, t4, a4
+; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or t3, s6, s0
+; RV32I-NEXT:    or t4, s2, ra
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, a0, s10
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, s1, t4
-; RV32I-NEXT:    or t0, s6, s2
-; RV32I-NEXT:    or t1, t5, t3
-; RV32I-NEXT:    or t2, s0, t6
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t1, a7
+; RV32I-NEXT:    or a7, s7, s1
+; RV32I-NEXT:    or t1, s4, s3
+; RV32I-NEXT:    or a3, t0, a3
+; RV32I-NEXT:    or a4, t2, a4
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    or t2, t6, t5
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    add s7, s7, a1
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    add s5, s5, a1
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw t0, 32(sp)
+; RV32I-NEXT:    sw t2, 36(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    lw a6, 16(s7)
-; RV32I-NEXT:    lw a5, 20(s7)
-; RV32I-NEXT:    lw a7, 24(s7)
-; RV32I-NEXT:    lw a1, 0(s7)
-; RV32I-NEXT:    lw a0, 4(s7)
-; RV32I-NEXT:    lw a4, 8(s7)
-; RV32I-NEXT:    lw a3, 12(s7)
-; RV32I-NEXT:    lw t0, 28(s7)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    lw a6, 16(s5)
+; RV32I-NEXT:    lw a5, 20(s5)
+; RV32I-NEXT:    lw a7, 24(s5)
+; RV32I-NEXT:    lw a1, 0(s5)
+; RV32I-NEXT:    lw a0, 4(s5)
+; RV32I-NEXT:    lw a4, 8(s5)
+; RV32I-NEXT:    lw a3, 12(s5)
+; RV32I-NEXT:    lw t0, 28(s5)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -5432,21 +5465,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
-; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a3, 8
+; RV32I-NEXT:    srli t1, a1, 16
 ; RV32I-NEXT:    sb t0, 28(a2)
-; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
-; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t4, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -5458,35 +5491,36 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb t1, 13(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t6, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -5512,112 +5546,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 5(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t1, 2(a0)
+; RV64I-NEXT:    lbu s3, 3(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s4, 9(a0)
+; RV64I-NEXT:    lbu s5, 10(a0)
+; RV64I-NEXT:    lbu s6, 11(a0)
+; RV64I-NEXT:    lbu s7, 12(a0)
+; RV64I-NEXT:    lbu s11, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu t5, 15(a0)
+; RV64I-NEXT:    lbu a3, 16(a0)
+; RV64I-NEXT:    lbu a6, 17(a0)
+; RV64I-NEXT:    lbu t2, 18(a0)
+; RV64I-NEXT:    lbu t3, 19(a0)
+; RV64I-NEXT:    lbu a4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
-; RV64I-NEXT:    lbu t3, 24(a0)
-; RV64I-NEXT:    lbu t4, 25(a0)
-; RV64I-NEXT:    lbu t5, 26(a0)
-; RV64I-NEXT:    lbu t6, 27(a0)
-; RV64I-NEXT:    slli s5, s5, 8
-; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s0, s5, s4
-; RV64I-NEXT:    or s1, s7, s6
-; RV64I-NEXT:    or s2, s9, s8
-; RV64I-NEXT:    lbu s3, 28(a0)
-; RV64I-NEXT:    lbu s4, 29(a0)
-; RV64I-NEXT:    lbu s5, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    slli s11, s11, 24
-; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    mv s7, sp
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, s3, t1
+; RV64I-NEXT:    or t0, s8, t0
+; RV64I-NEXT:    or t1, s10, s9
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s8, 25(a0)
+; RV64I-NEXT:    lbu s9, 26(a0)
+; RV64I-NEXT:    lbu s10, 27(a0)
 ; RV64I-NEXT:    slli s4, s4, 8
 ; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    or s2, s4, s2
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s5, s11, s7
+; RV64I-NEXT:    lbu s6, 28(a0)
+; RV64I-NEXT:    lbu s7, 29(a0)
+; RV64I-NEXT:    lbu s11, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    or t4, t5, t4
+; RV64I-NEXT:    mv t5, sp
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    slli s11, s11, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or t3, t4, t3
-; RV64I-NEXT:    or t4, t6, t5
-; RV64I-NEXT:    or t5, s4, s3
-; RV64I-NEXT:    or a0, a0, s5
-; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a6, t3, t2
+; RV64I-NEXT:    or a4, t6, a4
 ; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or a7, s6, s2
-; RV64I-NEXT:    or t0, t4, t3
-; RV64I-NEXT:    or a0, a0, t5
-; RV64I-NEXT:    add s7, s7, a1
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or t2, s8, s3
+; RV64I-NEXT:    or t3, s10, s9
+; RV64I-NEXT:    or t6, s7, s6
+; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or t0, s4, s2
+; RV64I-NEXT:    or t1, t4, s5
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, s0, a4
+; RV64I-NEXT:    or a6, t3, t2
+; RV64I-NEXT:    or a0, a0, t6
+; RV64I-NEXT:    add t5, t5, a1
 ; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, a7, s0
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sd a0, 32(sp)
 ; RV64I-NEXT:    sd a0, 40(sp)
 ; RV64I-NEXT:    sd a0, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a3, 0(sp)
-; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a5, 0(sp)
+; RV64I-NEXT:    sd a7, 8(sp)
+; RV64I-NEXT:    sd a3, 16(sp)
 ; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a4, 16(s7)
-; RV64I-NEXT:    ld a0, 8(s7)
-; RV64I-NEXT:    ld a1, 0(s7)
-; RV64I-NEXT:    ld a3, 24(s7)
+; RV64I-NEXT:    ld a4, 16(t5)
+; RV64I-NEXT:    ld a0, 8(t5)
+; RV64I-NEXT:    ld a1, 0(t5)
+; RV64I-NEXT:    ld a3, 24(t5)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -5636,25 +5670,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
+; RV64I-NEXT:    srli s8, a1, 24
+; RV64I-NEXT:    srli s9, a1, 16
+; RV64I-NEXT:    srli s10, a1, 8
+; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
-; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
-; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
 ; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli t1, a0, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli t2, a0, 32
+; RV64I-NEXT:    srli a6, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -5664,19 +5698,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli t3, a0, 16
+; RV64I-NEXT:    srli a7, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a6, 1(a2)
-; RV64I-NEXT:    sb a7, 2(a2)
-; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb s10, 1(a2)
+; RV64I-NEXT:    sb s9, 2(a2)
+; RV64I-NEXT:    sb s8, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb t2, 12(a2)
-; RV64I-NEXT:    sb t1, 13(a2)
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    sb a5, 15(a2)
+; RV64I-NEXT:    sb a6, 12(a2)
+; RV64I-NEXT:    sb a4, 13(a2)
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb s11, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb t3, 10(a2)
+; RV64I-NEXT:    sb a7, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -5695,129 +5729,130 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; RV32I-LABEL: ashr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t1, 3(a0)
+; RV32I-NEXT:    lbu s0, 4(a0)
+; RV32I-NEXT:    lbu s2, 5(a0)
+; RV32I-NEXT:    lbu s3, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s1, 8(a0)
+; RV32I-NEXT:    lbu s7, 9(a0)
+; RV32I-NEXT:    lbu s8, 10(a0)
+; RV32I-NEXT:    lbu s9, 11(a0)
+; RV32I-NEXT:    lbu s10, 12(a0)
+; RV32I-NEXT:    lbu s11, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t0, 17(a0)
+; RV32I-NEXT:    lbu t2, 18(a0)
+; RV32I-NEXT:    lbu t3, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 22(a0)
+; RV32I-NEXT:    lbu t6, 23(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t1, a7
+; RV32I-NEXT:    or a7, s2, s0
+; RV32I-NEXT:    or t1, s6, s3
+; RV32I-NEXT:    lbu s0, 24(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
+; RV32I-NEXT:    lbu ra, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or s1, s7, s1
+; RV32I-NEXT:    or s7, s9, s8
+; RV32I-NEXT:    or s3, s11, s10
+; RV32I-NEXT:    lbu s8, 28(a0)
+; RV32I-NEXT:    lbu s9, 29(a0)
+; RV32I-NEXT:    lbu s10, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or s4, s5, s4
+; RV32I-NEXT:    addi s5, sp, 8
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t5, 25(a0)
-; RV32I-NEXT:    lbu t6, 26(a0)
-; RV32I-NEXT:    lbu s0, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli s2, s2, 24
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t4, s5, s4
-; RV32I-NEXT:    or s1, s7, s6
-; RV32I-NEXT:    or s2, s9, s8
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or s6, s11, s10
-; RV32I-NEXT:    mv s7, sp
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or t3, t5, t3
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or t6, s4, s3
-; RV32I-NEXT:    or s0, a0, s5
+; RV32I-NEXT:    or a3, t0, a3
+; RV32I-NEXT:    or t0, t3, t2
+; RV32I-NEXT:    or a4, t4, a4
+; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or t3, s6, s0
+; RV32I-NEXT:    or t4, s2, ra
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, a0, s10
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, s1, t4
-; RV32I-NEXT:    or t0, s6, s2
-; RV32I-NEXT:    or t1, t5, t3
-; RV32I-NEXT:    or t2, s0, t6
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t1, a7
+; RV32I-NEXT:    or a7, s7, s1
+; RV32I-NEXT:    or t1, s4, s3
+; RV32I-NEXT:    or a3, t0, a3
+; RV32I-NEXT:    or a4, t2, a4
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    or t2, t6, t5
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    add s7, s7, a1
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    add s5, s5, a1
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw t0, 32(sp)
+; RV32I-NEXT:    sw t2, 36(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    lw a6, 16(s7)
-; RV32I-NEXT:    lw a5, 20(s7)
-; RV32I-NEXT:    lw a7, 24(s7)
-; RV32I-NEXT:    lw a1, 0(s7)
-; RV32I-NEXT:    lw a0, 4(s7)
-; RV32I-NEXT:    lw a4, 8(s7)
-; RV32I-NEXT:    lw a3, 12(s7)
-; RV32I-NEXT:    lw t0, 28(s7)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    lw a6, 16(s5)
+; RV32I-NEXT:    lw a5, 20(s5)
+; RV32I-NEXT:    lw a7, 24(s5)
+; RV32I-NEXT:    lw a1, 0(s5)
+; RV32I-NEXT:    lw a0, 4(s5)
+; RV32I-NEXT:    lw a4, 8(s5)
+; RV32I-NEXT:    lw a3, 12(s5)
+; RV32I-NEXT:    lw t0, 28(s5)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -5832,21 +5867,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
-; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a3, 8
+; RV32I-NEXT:    srli t1, a1, 16
 ; RV32I-NEXT:    sb t0, 28(a2)
-; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
-; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t4, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -5858,35 +5893,36 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb t1, 13(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb t6, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index b8952d2cb2b29e..b2c130c2d7c10a 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -1530,24 +1530,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -1556,105 +1557,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t5, 10(a0)
 ; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a7, a6
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
 ; RV32I-NEXT:    or a7, t4, t3
 ; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t6, 24(a0)
-; RV32I-NEXT:    lbu s0, 25(a0)
-; RV32I-NEXT:    lbu s1, 26(a0)
-; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
 ; RV32I-NEXT:    lbu s4, 29(a0)
 ; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or a0, s11, s10
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or s0, s2, s1
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    mv s4, sp
 ; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s5, s6, s5
-; RV32I-NEXT:    or s1, s2, s1
-; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, a0, t5
-; RV32I-NEXT:    or t1, s0, t6
-; RV32I-NEXT:    or t2, s5, s3
-; RV32I-NEXT:    or a0, a1, s1
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw t3, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s4, a4
+; RV32I-NEXT:    add a4, s3, a4
 ; RV32I-NEXT:    lw a3, 0(a4)
 ; RV32I-NEXT:    lw a5, 4(a4)
 ; RV32I-NEXT:    lw a6, 8(a4)
@@ -1714,13 +1717,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
+; RV32I-NEXT:    srli s8, a1, 24
+; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
-; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli a7, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb t5, 17(a2)
 ; RV32I-NEXT:    sb t4, 18(a2)
@@ -1741,26 +1744,27 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 14(a2)
 ; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t2, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    sb s9, 2(a2)
+; RV32I-NEXT:    sb s8, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2002,24 +2006,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -2028,105 +2033,107 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t5, 10(a0)
 ; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a7, a6
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
 ; RV32I-NEXT:    or a7, t4, t3
 ; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t6, 24(a0)
-; RV32I-NEXT:    lbu s0, 25(a0)
-; RV32I-NEXT:    lbu s1, 26(a0)
-; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
 ; RV32I-NEXT:    lbu s4, 29(a0)
 ; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or a0, s11, s10
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or s0, s2, s1
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 40
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    addi s4, sp, 32
 ; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s5, s6, s5
-; RV32I-NEXT:    or s1, s2, s1
-; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, a0, t5
-; RV32I-NEXT:    or t1, s0, t6
-; RV32I-NEXT:    or t2, s5, s3
-; RV32I-NEXT:    or a0, a1, s1
-; RV32I-NEXT:    sw a7, 48(sp)
-; RV32I-NEXT:    sw t0, 52(sp)
-; RV32I-NEXT:    sw t1, 56(sp)
-; RV32I-NEXT:    sw t2, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
-; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw t2, 64(sp)
+; RV32I-NEXT:    sw t3, 68(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    sw a7, 52(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    sub a3, s4, a4
+; RV32I-NEXT:    sub a3, s3, a4
 ; RV32I-NEXT:    lw a4, 0(a3)
 ; RV32I-NEXT:    lw a5, 4(a3)
 ; RV32I-NEXT:    lw a6, 8(a3)
@@ -2186,13 +2193,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
+; RV32I-NEXT:    srli s8, a1, 24
+; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
-; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli a7, a1, 8
 ; RV32I-NEXT:    sb a6, 28(a2)
 ; RV32I-NEXT:    sb t5, 29(a2)
 ; RV32I-NEXT:    sb t4, 30(a2)
@@ -2213,26 +2220,27 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 10(a2)
 ; RV32I-NEXT:    sb s5, 11(a2)
 ; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    sb t0, 13(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb a7, 15(a2)
+; RV32I-NEXT:    sb a7, 13(a2)
+; RV32I-NEXT:    sb s9, 14(a2)
+; RV32I-NEXT:    sb s8, 15(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2475,24 +2483,25 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -2509,98 +2518,100 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a7, a6
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
 ; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu a3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    slli s5, s5, 8
 ; RV32I-NEXT:    or t1, s1, s0
 ; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    lbu t6, 24(a0)
-; RV32I-NEXT:    lbu s0, 25(a0)
-; RV32I-NEXT:    lbu s1, 26(a0)
-; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu s1, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli s6, s6, 16
 ; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or s6, s11, s10
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or s0, s2, s1
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    or s2, s7, s6
+; RV32I-NEXT:    or s3, s9, s8
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    lbu s5, 0(a1)
+; RV32I-NEXT:    lbu s6, 1(a1)
 ; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    mv s4, sp
-; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, ra
+; RV32I-NEXT:    addi s8, sp, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s5, a0, s5
-; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or s1, a0, s1
+; RV32I-NEXT:    or t6, s6, s5
 ; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    srai s2, a0, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, s6, t5
-; RV32I-NEXT:    or t1, s0, t6
-; RV32I-NEXT:    or t2, s5, s3
-; RV32I-NEXT:    or a0, a1, s1
-; RV32I-NEXT:    sw s2, 48(sp)
-; RV32I-NEXT:    sw s2, 52(sp)
-; RV32I-NEXT:    sw s2, 56(sp)
-; RV32I-NEXT:    sw s2, 60(sp)
-; RV32I-NEXT:    sw s2, 32(sp)
-; RV32I-NEXT:    sw s2, 36(sp)
-; RV32I-NEXT:    sw s2, 40(sp)
-; RV32I-NEXT:    sw s2, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, a0
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, s2, t3
+; RV32I-NEXT:    or t1, s4, s3
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    or t2, s1, t5
+; RV32I-NEXT:    or a0, a1, t6
+; RV32I-NEXT:    sw s0, 56(sp)
+; RV32I-NEXT:    sw s0, 60(sp)
+; RV32I-NEXT:    sw s0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw s0, 40(sp)
+; RV32I-NEXT:    sw s0, 44(sp)
+; RV32I-NEXT:    sw s0, 48(sp)
+; RV32I-NEXT:    sw s0, 52(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw t2, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s4, a4
+; RV32I-NEXT:    add a4, s8, a4
 ; RV32I-NEXT:    lw a3, 0(a4)
 ; RV32I-NEXT:    lw a5, 4(a4)
 ; RV32I-NEXT:    lw a6, 8(a4)
@@ -2660,13 +2671,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
+; RV32I-NEXT:    srli s8, a1, 24
+; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
-; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli a7, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb t5, 17(a2)
 ; RV32I-NEXT:    sb t4, 18(a2)
@@ -2687,26 +2698,27 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 14(a2)
 ; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t2, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    sb s9, 2(a2)
+; RV32I-NEXT:    sb s8, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 67759bd5c4632e..2c4b1f36ffd23d 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -50,8 +50,8 @@ public:
   const char *getRegPressureSetName(unsigned Idx) const override {
     return "bogus";
   }
-  unsigned getRegPressureSetLimit(const MachineFunction &MF,
-                                  unsigned Idx) const override {
+  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
+                                  bool RemoveReserved) const override {
     return 0;
   }
   const int *
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index a6f87119aca5ba..674925c1b2acd3 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -275,7 +275,8 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
   OS << "// Get the register unit pressure limit for this dimension.\n"
      << "// This limit must be adjusted dynamically for reserved registers.\n"
      << "unsigned " << ClassName << "::\n"
-     << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const "
+     << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, bool "
+        "RemoveReserved) const "
         "{\n"
      << "  static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
      << " PressureLimitTable[] = {\n";
@@ -1130,7 +1131,7 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) {
      << "  unsigned getNumRegPressureSets() const override;\n"
      << "  const char *getRegPressureSetName(unsigned Idx) const override;\n"
      << "  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned "
-        "Idx) const override;\n"
+        "Idx, bool RemoveReserved = true) const override;\n"
      << "  const int *getRegClassPressureSets("
      << "const TargetRegisterClass *RC) const override;\n"
      << "  const int *getRegUnitPressureSets("

>From 6b0461f0b6b90dcd983cf288220879d6c087e99d Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Tue, 3 Dec 2024 21:47:30 +0800
Subject: [PATCH 3/3] Revert "Test commit: add a parameter to keep reserved"

This reverts commit e96f7f7898790da1fe9cdc5cd3be7e3ae8eb8705.
---
 .../include/llvm/CodeGen/TargetRegisterInfo.h |    4 +-
 llvm/lib/CodeGen/RegisterClassInfo.cpp        |    3 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |    3 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |    4 +-
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |    8 +-
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h     |    4 +-
 llvm/test/CodeGen/RISCV/pr69586.ll            |  844 ++---
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |   78 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    | 2104 ++++++-----
 .../RISCV/rvv/intrinsic-vector-match.ll       |  472 ++-
 ...lar-shift-by-byte-multiple-legalization.ll | 3242 ++++++++---------
 .../RISCV/wide-scalar-shift-legalization.ll   |  646 ++--
 llvm/unittests/CodeGen/MFCommon.inc           |    4 +-
 llvm/utils/TableGen/RegisterInfoEmitter.cpp   |    5 +-
 14 files changed, 3606 insertions(+), 3815 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index eaed26e33c4eb5..292fa3c94969be 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -914,10 +914,8 @@ class TargetRegisterInfo : public MCRegisterInfo {
 
   /// Get the register unit pressure limit for this dimension.
   /// This limit must be adjusted dynamically for reserved registers.
-  /// If RemoveReserved is true, the target should remove reserved registers.
   virtual unsigned getRegPressureSetLimit(const MachineFunction &MF,
-                                          unsigned Idx,
-                                          bool RemoveReserved = true) const = 0;
+                                          unsigned Idx) const = 0;
 
   /// Get the dimensions of register pressure impacted by this register class.
   /// Returns a -1 terminated array of pressure set IDs.
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 0a33915ed1e40b..9312bc03bc522a 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -222,8 +222,7 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
   assert(RC && "Failed to find register class");
   compute(RC);
   unsigned NAllocatableRegs = getNumAllocatableRegs(RC);
-  unsigned RegPressureSetLimit =
-      TRI->getRegPressureSetLimit(*MF, Idx, /*RemoveReserved=*/false);
+  unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx);
   // If all the regs are reserved, return raw RegPressureSetLimit.
   // One example is VRSAVERC in PowerPC.
   // Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit
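
(Side note for reviewers, not part of the patch: the comments above carry the reasoning for taking the raw limit here, namely that a pressure set whose registers are all reserved, such as VRSAVERC on PowerPC, must not produce a limit of zero. The snippet below is a minimal standalone sketch of that guard only; the function name, the fallback, and the subtraction are illustrative assumptions, not the in-tree computePSetLimit.)

```cpp
// Minimal sketch (illustrative, not LLVM's computePSetLimit): keep the raw
// TableGen-derived limit when a set is fully reserved, otherwise subtract
// the reserved registers so the limit reflects only allocatable ones.
#include <cstdio>

unsigned computePSetLimitSketch(unsigned RawLimit, unsigned NumRegs,
                                unsigned NumAllocatable) {
  if (NumAllocatable == 0)
    return RawLimit; // fully reserved set: avoid returning zero
  unsigned Reserved = NumRegs - NumAllocatable;
  return RawLimit > Reserved ? RawLimit - Reserved : 1;
}

int main() {
  std::printf("all reserved -> %u\n", computePSetLimitSketch(32, 32, 0));
  std::printf("6 reserved   -> %u\n", computePSetLimitSketch(32, 32, 26));
  return 0;
}
```
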
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9883454ed78298..049f4af4dd2f93 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3640,8 +3640,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
 }
 
 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
-                                                unsigned Idx,
-                                                bool RemoveReserved) const {
+                                                unsigned Idx) const {
   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index b55f5f2c418b09..8e481e3ac23043 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -331,8 +331,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const override;
 
-  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
-                                  bool RemoveReserved = true) const override;
+  unsigned getRegPressureSetLimit(const MachineFunction &MF,
+                                  unsigned Idx) const override;
 
   const int *getRegUnitPressureSets(unsigned RegUnit) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index d5a769b6c78c7c..a73bd1621a739d 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -936,12 +936,8 @@ bool RISCVRegisterInfo::getRegAllocationHints(
 }
 
 unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
-                                                   unsigned Idx,
-                                                   bool RemoveReserved) const {
+                                                   unsigned Idx) const {
   if (Idx == RISCV::RegisterPressureSets::GPRAll) {
-    if (!RemoveReserved)
-      return 32;
-
     unsigned Reserved = 0;
     BitVector ReservedRegs = getReservedRegs(MF);
     for (MCPhysReg Reg = RISCV::X0_H; Reg <= RISCV::X31_H; Reg++)
@@ -950,5 +946,5 @@ unsigned RISCVRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
 
     return 32 - Reserved;
   }
-  return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx, RemoveReserved);
+  return RISCVGenRegisterInfo::getRegPressureSetLimit(MF, Idx);
 }
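
(Side note, not part of the patch: the restored RISCV override above computes the GPRAll limit as 32 minus the number of scalar registers reserved for the current function. Below is a small standalone sketch of that computation; the concrete reserved set used here, x0/zero, x2/sp, x3/gp, x4/tp, is an assumption for illustration only, since the real set comes from getReservedRegs(MF) and can also include the frame or base pointer.)

```cpp
// Standalone illustration of the adjusted GPRAll limit; not the in-tree code.
// The reserved set below is an assumption chosen for the example.
#include <bitset>
#include <cstdio>

int main() {
  constexpr unsigned NumGPRs = 32;
  std::bitset<NumGPRs> Reserved;
  Reserved.set(0); // x0 (zero)
  Reserved.set(2); // x2 (sp)
  Reserved.set(3); // x3 (gp)
  Reserved.set(4); // x4 (tp)

  // Mirror of the `32 - Reserved` computation in the override: count the
  // reserved scalar registers and subtract them from the raw 32-register limit.
  unsigned Limit = NumGPRs - static_cast<unsigned>(Reserved.count());
  std::printf("GPRAll pressure limit = %u\n", Limit); // prints 28 here
  return 0;
}
```
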
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 58f97394ec559b..ca4934de2f52d2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,8 +144,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
   static bool isRVVRegClass(const TargetRegisterClass *RC) {
     return RISCVRI::isVRegClass(RC->TSFlags);
   }
-  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
-                                  bool RemoveReserved = true) const override;
+  unsigned getRegPressureSetLimit(const MachineFunction &MF,
+                                  unsigned Idx) const override;
 };
 } // namespace llvm
 
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 8e6a7add781c93..21e64ada7061aa 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -39,388 +39,384 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    slli a2, a2, 1
 ; NOREMAT-NEXT:    sub sp, sp, a2
 ; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
-; NOREMAT-NEXT:    li a7, 32
-; NOREMAT-NEXT:    addi a6, a0, 512
-; NOREMAT-NEXT:    addi a4, a0, 1024
-; NOREMAT-NEXT:    addi a5, a0, 1536
-; NOREMAT-NEXT:    li t0, 1
+; NOREMAT-NEXT:    mv a7, a0
+; NOREMAT-NEXT:    li a0, 32
+; NOREMAT-NEXT:    addi a5, a7, 512
+; NOREMAT-NEXT:    addi a4, a7, 1024
+; NOREMAT-NEXT:    addi a6, a7, 1536
+; NOREMAT-NEXT:    li t1, 1
 ; NOREMAT-NEXT:    li a3, 5
-; NOREMAT-NEXT:    li t1, 3
+; NOREMAT-NEXT:    li t0, 3
 ; NOREMAT-NEXT:    li a2, 7
 ; NOREMAT-NEXT:    lui t2, 1
-; NOREMAT-NEXT:    li s4, 9
-; NOREMAT-NEXT:    li s6, 11
-; NOREMAT-NEXT:    li s9, 13
-; NOREMAT-NEXT:    lui s7, 2
-; NOREMAT-NEXT:    lui s1, 3
-; NOREMAT-NEXT:    lui ra, 4
-; NOREMAT-NEXT:    lui t3, 5
-; NOREMAT-NEXT:    lui s0, 6
-; NOREMAT-NEXT:    lui s3, 7
-; NOREMAT-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
-; NOREMAT-NEXT:    slli t0, t0, 11
-; NOREMAT-NEXT:    sd t0, 504(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli t5, a3, 9
-; NOREMAT-NEXT:    slli t6, t1, 10
-; NOREMAT-NEXT:    slli s2, a2, 9
-; NOREMAT-NEXT:    add a7, a0, t2
-; NOREMAT-NEXT:    lui s11, 1
-; NOREMAT-NEXT:    slli s4, s4, 9
-; NOREMAT-NEXT:    slli s5, a3, 10
-; NOREMAT-NEXT:    vle32.v v8, (a6)
-; NOREMAT-NEXT:    slli s6, s6, 9
-; NOREMAT-NEXT:    slli s8, t1, 11
+; NOREMAT-NEXT:    li s5, 9
+; NOREMAT-NEXT:    li s8, 11
+; NOREMAT-NEXT:    lui s1, 2
+; NOREMAT-NEXT:    lui t5, 3
+; NOREMAT-NEXT:    lui s11, 4
+; NOREMAT-NEXT:    lui ra, 5
+; NOREMAT-NEXT:    lui t3, 6
+; NOREMAT-NEXT:    lui s0, 7
+; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; NOREMAT-NEXT:    slli t4, t1, 11
+; NOREMAT-NEXT:    slli t6, a3, 9
+; NOREMAT-NEXT:    slli s2, t0, 10
+; NOREMAT-NEXT:    slli s4, a2, 9
+; NOREMAT-NEXT:    add a0, a7, t2
+; NOREMAT-NEXT:    vle32.v v8, (a5)
+; NOREMAT-NEXT:    slli s5, s5, 9
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    slli s9, s9, 9
-; NOREMAT-NEXT:    vle32.v v0, (a5)
-; NOREMAT-NEXT:    vle32.v v12, (a5)
-; NOREMAT-NEXT:    slli s10, a2, 10
-; NOREMAT-NEXT:    vle32.v v4, (a7)
-; NOREMAT-NEXT:    vle32.v v20, (a7)
-; NOREMAT-NEXT:    add a4, a0, s7
+; NOREMAT-NEXT:    slli s6, a3, 10
+; NOREMAT-NEXT:    vle32.v v0, (a6)
+; NOREMAT-NEXT:    vle32.v v12, (a6)
+; NOREMAT-NEXT:    slli s8, s8, 9
+; NOREMAT-NEXT:    slli s9, t0, 11
+; NOREMAT-NEXT:    vle32.v v4, (a0)
+; NOREMAT-NEXT:    vle32.v v20, (a0)
+; NOREMAT-NEXT:    add a4, a7, s1
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    add a4, a0, s1
+; NOREMAT-NEXT:    add a4, a7, t5
 ; NOREMAT-NEXT:    vle32.v v28, (a4)
 ; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    add a4, a0, ra
+; NOREMAT-NEXT:    add a4, a7, s11
 ; NOREMAT-NEXT:    vle32.v v24, (a4)
 ; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    add a4, a0, t3
-; NOREMAT-NEXT:    vle32.v v14, (a0)
+; NOREMAT-NEXT:    add a4, a7, ra
+; NOREMAT-NEXT:    vle32.v v14, (a7)
 ; NOREMAT-NEXT:    vle32.v v18, (a4)
 ; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    add a4, a0, s0
+; NOREMAT-NEXT:    add a4, a7, t3
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v8
 ; NOREMAT-NEXT:    vle32.v v14, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    addi a4, sp, 640
-; NOREMAT-NEXT:    vs2r.v v8, (a4) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    add a4, a0, t0
+; NOREMAT-NEXT:    addi a0, sp, 640
+; NOREMAT-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; NOREMAT-NEXT:    add a4, a7, t4
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a0, t5
+; NOREMAT-NEXT:    add a4, a7, t6
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v10
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a0, t6
+; NOREMAT-NEXT:    add a4, a7, s2
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a0, s2
+; NOREMAT-NEXT:    add a4, a7, s4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a0, s3
+; NOREMAT-NEXT:    add a4, a7, s0
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v8
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a0, s4
+; NOREMAT-NEXT:    add a4, a7, s5
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a0, s5
+; NOREMAT-NEXT:    add a4, a7, s6
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v8
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a0, s6
+; NOREMAT-NEXT:    add a4, a7, s8
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a0, s8
+; NOREMAT-NEXT:    add a4, a7, s9
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a0, s9
+; NOREMAT-NEXT:    li t5, 13
+; NOREMAT-NEXT:    slli a4, t5, 9
+; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a0, s10
+; NOREMAT-NEXT:    slli a4, a2, 10
+; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    li t2, 15
-; NOREMAT-NEXT:    slli a4, t2, 9
-; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    li a6, 15
+; NOREMAT-NEXT:    slli a4, a6, 9
+; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT:    lui t4, 8
-; NOREMAT-NEXT:    add a5, a0, t4
+; NOREMAT-NEXT:    lui t1, 8
+; NOREMAT-NEXT:    add a5, a7, t1
 ; NOREMAT-NEXT:    vle32.v v20, (a5)
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v2
 ; NOREMAT-NEXT:    li a4, 17
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li s1, 17
-; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    li t2, 17
+; NOREMAT-NEXT:    sd a4, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
 ; NOREMAT-NEXT:    li a5, 9
 ; NOREMAT-NEXT:    slli a4, a5, 10
-; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    sd a4, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
 ; NOREMAT-NEXT:    li a4, 19
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li t1, 19
-; NOREMAT-NEXT:    sd a4, 600(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    li s1, 19
+; NOREMAT-NEXT:    sd a4, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
 ; NOREMAT-NEXT:    slli a3, a3, 11
-; NOREMAT-NEXT:    sd a3, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li s7, 21
 ; NOREMAT-NEXT:    slli a3, s7, 9
-; NOREMAT-NEXT:    sd a3, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    li a6, 11
-; NOREMAT-NEXT:    slli a3, a6, 10
-; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    li a4, 11
+; NOREMAT-NEXT:    slli a3, a4, 10
+; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
 ; NOREMAT-NEXT:    li s3, 23
-; NOREMAT-NEXT:    slli a3, s3, 9
-; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    slli s10, s3, 9
+; NOREMAT-NEXT:    add a3, a7, s10
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
 ; NOREMAT-NEXT:    li s0, 25
 ; NOREMAT-NEXT:    slli a3, s0, 9
-; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT:    li a7, 13
-; NOREMAT-NEXT:    slli a3, a7, 10
-; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    slli a3, t5, 10
+; NOREMAT-NEXT:    sd a3, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
 ; NOREMAT-NEXT:    li t3, 27
 ; NOREMAT-NEXT:    slli a3, t3, 9
-; NOREMAT-NEXT:    sd a3, 544(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    sd a3, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v28, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    slli a2, a2, 11
-; NOREMAT-NEXT:    sd a2, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li t0, 29
 ; NOREMAT-NEXT:    slli a2, t0, 9
-; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT:    slli a2, t2, 10
-; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li t2, 15
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    slli a2, a6, 10
+; NOREMAT-NEXT:    sd a2, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
 ; NOREMAT-NEXT:    li a3, 31
-; NOREMAT-NEXT:    slli a2, a3, 9
-; NOREMAT-NEXT:    sd a2, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    slli a0, a3, 9
+; NOREMAT-NEXT:    sd a0, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a7, a0
+; NOREMAT-NEXT:    vle32.v v12, (a0)
+; NOREMAT-NEXT:    vle32.v v4, (a0)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT:    addiw a2, ra, 512
+; NOREMAT-NEXT:    addiw a2, s11, 512
 ; NOREMAT-NEXT:    sd a2, 496(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
-; NOREMAT-NEXT:    slli a2, s1, 10
+; NOREMAT-NEXT:    slli a2, t2, 10
 ; NOREMAT-NEXT:    sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    addiw a2, ra, 1536
+; NOREMAT-NEXT:    addiw a2, s11, 1536
 ; NOREMAT-NEXT:    sd a2, 480(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    slli a2, a5, 11
 ; NOREMAT-NEXT:    sd a2, 472(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v24
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v8
-; NOREMAT-NEXT:    lui a4, 5
-; NOREMAT-NEXT:    addiw a2, a4, -1536
+; NOREMAT-NEXT:    addiw a2, ra, -1536
 ; NOREMAT-NEXT:    sd a2, 464(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
-; NOREMAT-NEXT:    slli a2, t1, 10
+; NOREMAT-NEXT:    slli a2, s1, 10
 ; NOREMAT-NEXT:    sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li t1, 19
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT:    addiw a2, a4, -512
+; NOREMAT-NEXT:    addiw a2, ra, -512
 ; NOREMAT-NEXT:    sd a2, 448(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v24
-; NOREMAT-NEXT:    addiw a2, a4, 512
+; NOREMAT-NEXT:    addiw a2, ra, 512
 ; NOREMAT-NEXT:    sd a2, 440(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    slli a2, s7, 10
 ; NOREMAT-NEXT:    sd a2, 432(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
-; NOREMAT-NEXT:    addiw a2, a4, 1536
+; NOREMAT-NEXT:    addiw a2, ra, 1536
 ; NOREMAT-NEXT:    sd a2, 424(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
-; NOREMAT-NEXT:    slli a2, a6, 11
+; NOREMAT-NEXT:    slli a2, a4, 11
 ; NOREMAT-NEXT:    sd a2, 416(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v12
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
-; NOREMAT-NEXT:    lui a5, 6
-; NOREMAT-NEXT:    addiw a2, a5, -1536
+; NOREMAT-NEXT:    lui a4, 6
+; NOREMAT-NEXT:    addiw a2, a4, -1536
 ; NOREMAT-NEXT:    sd a2, 408(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    slli a2, s3, 10
 ; NOREMAT-NEXT:    sd a2, 400(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT:    addiw a2, a5, -512
+; NOREMAT-NEXT:    addiw a2, a4, -512
 ; NOREMAT-NEXT:    sd a2, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT:    addiw a2, a5, 512
+; NOREMAT-NEXT:    addiw a2, a4, 512
 ; NOREMAT-NEXT:    sd a2, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    slli a2, s0, 10
 ; NOREMAT-NEXT:    sd a2, 376(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v18
-; NOREMAT-NEXT:    addiw a2, a5, 1536
+; NOREMAT-NEXT:    addiw a2, a4, 1536
 ; NOREMAT-NEXT:    sd a2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, a7, 11
+; NOREMAT-NEXT:    slli a2, t5, 11
 ; NOREMAT-NEXT:    sd a2, 360(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v8
-; NOREMAT-NEXT:    lui a7, 7
-; NOREMAT-NEXT:    addiw a2, a7, -1536
+; NOREMAT-NEXT:    lui a5, 7
+; NOREMAT-NEXT:    addiw a2, a5, -1536
 ; NOREMAT-NEXT:    sd a2, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    slli a2, t3, 10
 ; NOREMAT-NEXT:    sd a2, 344(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v14
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v14, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addi a2, sp, 640
-; NOREMAT-NEXT:    vl2r.v v12, (a2) # Unknown-size Folded Reload
+; NOREMAT-NEXT:    addi a0, sp, 640
+; NOREMAT-NEXT:    vl2r.v v12, (a0) # Unknown-size Folded Reload
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v22
-; NOREMAT-NEXT:    addiw a2, a7, -512
+; NOREMAT-NEXT:    addiw a2, a5, -512
 ; NOREMAT-NEXT:    sd a2, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
-; NOREMAT-NEXT:    addiw a2, a7, 512
+; NOREMAT-NEXT:    addiw a2, a5, 512
 ; NOREMAT-NEXT:    sd a2, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    slli a2, t0, 10
 ; NOREMAT-NEXT:    sd a2, 320(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v18
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v16
-; NOREMAT-NEXT:    addiw a2, a7, 1536
+; NOREMAT-NEXT:    addiw a2, a5, 1536
 ; NOREMAT-NEXT:    sd a2, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, t2, 11
+; NOREMAT-NEXT:    slli a2, a6, 11
 ; NOREMAT-NEXT:    sd a2, 304(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v14
-; NOREMAT-NEXT:    addiw a2, t4, -1536
+; NOREMAT-NEXT:    addiw a2, t1, -1536
 ; NOREMAT-NEXT:    sd a2, 296(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v14, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    slli a2, a3, 10
 ; NOREMAT-NEXT:    sd a2, 288(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v22
-; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addiw a2, t4, -512
-; NOREMAT-NEXT:    sd a2, 280(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a0, a2
+; NOREMAT-NEXT:    addiw a0, t1, -512
+; NOREMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a7, a0
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
 ; NOREMAT-NEXT:    vle32.v v12, (a0)
 ; NOREMAT-NEXT:    vle32.v v0, (a0)
@@ -435,32 +431,33 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    addi a0, a1, 1024
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    add s11, a1, s11
-; NOREMAT-NEXT:    sd s11, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 1
+; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 2
 ; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 3
 ; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add s11, a1, s11
+; NOREMAT-NEXT:    sd s11, 248(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 240(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a1, a4
-; NOREMAT-NEXT:    sd a4, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a4, 232(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a5, a1, a5
-; NOREMAT-NEXT:    sd a5, 232(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a7, a1, a7
-; NOREMAT-NEXT:    sd a7, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a1, t4
+; NOREMAT-NEXT:    sd a5, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a1, t1
 ; NOREMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 512
+; NOREMAT-NEXT:    addiw a0, t1, 512
 ; NOREMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 1024
+; NOREMAT-NEXT:    addiw a0, t1, 1024
 ; NOREMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 1536
+; NOREMAT-NEXT:    addiw a0, t1, 1536
 ; NOREMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli s1, s1, 11
-; NOREMAT-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli t2, t2, 11
+; NOREMAT-NEXT:    sd t2, 128(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 9
 ; NOREMAT-NEXT:    addiw a2, a0, -1536
 ; NOREMAT-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
@@ -473,7 +470,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    addiw s11, a0, 512
 ; NOREMAT-NEXT:    addiw s7, a0, 1024
 ; NOREMAT-NEXT:    addiw s3, a0, 1536
-; NOREMAT-NEXT:    slli s1, t1, 11
+; NOREMAT-NEXT:    slli s1, s1, 11
 ; NOREMAT-NEXT:    lui a0, 10
 ; NOREMAT-NEXT:    addiw t2, a0, -1536
 ; NOREMAT-NEXT:    addiw a7, a0, -1024
@@ -481,52 +478,52 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    add a2, a1, a0
 ; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    addiw a0, a0, 512
-; NOREMAT-NEXT:    ld a2, 504(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    add a3, a1, t5
-; NOREMAT-NEXT:    add a5, a1, t6
-; NOREMAT-NEXT:    add a6, a1, s2
-; NOREMAT-NEXT:    add t0, a1, s4
-; NOREMAT-NEXT:    add t1, a1, s5
-; NOREMAT-NEXT:    add t3, a1, s6
-; NOREMAT-NEXT:    add t4, a1, s8
-; NOREMAT-NEXT:    add t5, a1, s9
-; NOREMAT-NEXT:    add t6, a1, s10
-; NOREMAT-NEXT:    ld s0, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a2, a1, t4
+; NOREMAT-NEXT:    add a3, a1, t6
+; NOREMAT-NEXT:    add a5, a1, s2
+; NOREMAT-NEXT:    add a6, a1, s4
+; NOREMAT-NEXT:    add t0, a1, s5
+; NOREMAT-NEXT:    add t1, a1, s6
+; NOREMAT-NEXT:    add t3, a1, s8
+; NOREMAT-NEXT:    add t4, a1, s9
+; NOREMAT-NEXT:    ld t5, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add t5, a1, t5
+; NOREMAT-NEXT:    ld t6, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add t6, a1, t6
+; NOREMAT-NEXT:    ld s0, 608(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s0, a1, s0
-; NOREMAT-NEXT:    ld s2, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s2, 600(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s2, a1, s2
-; NOREMAT-NEXT:    ld s4, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s4, 592(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s4, a1, s4
-; NOREMAT-NEXT:    ld s5, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s5, 584(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s5, a1, s5
-; NOREMAT-NEXT:    ld s6, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s6, 576(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s6, a1, s6
-; NOREMAT-NEXT:    ld s8, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s8, 568(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s8, a1, s8
-; NOREMAT-NEXT:    ld s9, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s9, 560(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s9, a1, s9
-; NOREMAT-NEXT:    ld s10, 568(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s10, a1, s10
-; NOREMAT-NEXT:    ld ra, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 16(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 512(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 504(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 80(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 496(sp) # 8-byte Folded Reload
@@ -920,10 +917,9 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_offset s10, -96
 ; REMAT-NEXT:    .cfi_offset s11, -104
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 14
-; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    slli a2, a2, 3
 ; REMAT-NEXT:    sub sp, sp, a2
-; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb
+; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 8 * vlenb
 ; REMAT-NEXT:    li a4, 32
 ; REMAT-NEXT:    addi a5, a0, 512
 ; REMAT-NEXT:    addi a3, a0, 1024
@@ -960,20 +956,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli s6, s6, 9
 ; REMAT-NEXT:    li s7, 5
 ; REMAT-NEXT:    slli s7, s7, 11
-; REMAT-NEXT:    li s8, 21
-; REMAT-NEXT:    slli s8, s8, 9
-; REMAT-NEXT:    li s9, 11
-; REMAT-NEXT:    slli s9, s9, 10
-; REMAT-NEXT:    li s10, 23
-; REMAT-NEXT:    slli s10, s10, 9
-; REMAT-NEXT:    lui s11, 3
 ; REMAT-NEXT:    vsetvli zero, a4, e32, m2, ta, ma
 ; REMAT-NEXT:    vle32.v v8, (a5)
-; REMAT-NEXT:    li a4, 25
+; REMAT-NEXT:    li a4, 21
 ; REMAT-NEXT:    slli a4, a4, 9
 ; REMAT-NEXT:    vle32.v v10, (a3)
 ; REMAT-NEXT:    vle32.v v12, (a3)
-; REMAT-NEXT:    li a3, 13
+; REMAT-NEXT:    li a3, 11
 ; REMAT-NEXT:    slli a3, a3, 10
 ; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    vle32.v v16, (a2)
@@ -990,7 +979,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a5, 12
+; REMAT-NEXT:    li a5, 6
 ; REMAT-NEXT:    mul a2, a2, a5
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
@@ -1000,8 +989,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a5, 10
-; REMAT-NEXT:    mul a2, a2, a5
+; REMAT-NEXT:    slli a2, a2, 2
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
@@ -1015,16 +1003,11 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v14
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    add a2, a0, t5
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
 ; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    add a2, a0, s0
@@ -1034,383 +1017,340 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, s1
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v30
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    add a2, a0, s2
-; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    li a6, 12
+; REMAT-NEXT:    li a6, 6
 ; REMAT-NEXT:    mul a5, a5, a6
 ; REMAT-NEXT:    add a5, sp, a5
 ; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v12, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT:    vl2r.v v28, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v2
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    add a2, a0, s3
-; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    li a6, 10
-; REMAT-NEXT:    mul a5, a5, a6
+; REMAT-NEXT:    slli a5, a5, 2
 ; REMAT-NEXT:    add a5, sp, a5
 ; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v16, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
-; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    vl2r.v v30, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v4
+; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    add a2, a0, s4
-; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    add a2, a0, s5
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v14
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    add a2, a0, s6
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    slli a5, a5, 3
-; REMAT-NEXT:    add a5, sp, a5
-; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v0, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    add a2, a0, s5
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v8
 ; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    add a2, a0, s6
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v16
 ; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v20, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, s8
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    add a2, a0, a4
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    add a2, a0, s9
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    add a2, a0, s10
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v12
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    add a2, a0, s11
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v16
-; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    addi a2, sp, 432
-; REMAT-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, a4
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 1
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    vs2r.v v24, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    add a2, a0, a3
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 12
-; REMAT-NEXT:    mul a2, a2, a3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a5, 27
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v12
+; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    li a5, 23
 ; REMAT-NEXT:    slli a5, a5, 9
 ; REMAT-NEXT:    add a2, a0, a5
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v28
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
-; REMAT-NEXT:    vle32.v v18, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 10
+; REMAT-NEXT:    li a3, 6
 ; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li ra, 7
-; REMAT-NEXT:    slli ra, ra, 11
-; REMAT-NEXT:    add a2, a0, ra
+; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    lui s8, 3
+; REMAT-NEXT:    add a2, a0, s8
 ; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 3
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v18, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v20
-; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v30
+; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    slli a2, a2, 2
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a2, 29
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li s9, 25
+; REMAT-NEXT:    slli s9, s9, 9
+; REMAT-NEXT:    add a2, a0, s9
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v22
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 6
-; REMAT-NEXT:    mul a2, a2, a3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a2, 15
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v6
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    li s10, 13
+; REMAT-NEXT:    slli s10, s10, 10
+; REMAT-NEXT:    add a2, a0, s10
 ; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v8
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 2
+; REMAT-NEXT:    slli a2, a2, 1
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    li a2, 31
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    li s11, 27
+; REMAT-NEXT:    slli s11, s11, 9
+; REMAT-NEXT:    add a2, a0, s11
 ; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v16
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    li ra, 7
+; REMAT-NEXT:    slli ra, ra, 11
+; REMAT-NEXT:    add a2, a0, ra
 ; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    addi a3, sp, 432
-; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v16
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v20
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    li a2, 29
+; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 1
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    addi a3, sp, 432
 ; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 17
+; REMAT-NEXT:    li a2, 15
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 12
-; REMAT-NEXT:    mul a3, a3, a4
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui a2, 4
-; REMAT-NEXT:    addiw a2, a2, 1536
+; REMAT-NEXT:    li a2, 31
+; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 10
+; REMAT-NEXT:    li a4, 6
 ; REMAT-NEXT:    mul a3, a3, a4
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 9
-; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    slli a3, a3, 2
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    addiw a2, a2, 512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 6
-; REMAT-NEXT:    mul a3, a3, a4
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v14, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li a2, 19
+; REMAT-NEXT:    li a2, 17
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 2
+; REMAT-NEXT:    slli a3, a3, 1
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v16, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    lui a2, 4
+; REMAT-NEXT:    addiw a2, a2, 1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    li a2, 9
+; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    lui a2, 5
-; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    addiw a2, a2, -1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 21
+; REMAT-NEXT:    li a2, 19
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui s4, 5
-; REMAT-NEXT:    addiw s4, s4, 1536
-; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 11
-; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    lui a2, 5
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui s3, 6
-; REMAT-NEXT:    addiw s3, s3, -1536
-; REMAT-NEXT:    add a2, a0, s3
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li s2, 23
-; REMAT-NEXT:    slli s2, s2, 10
-; REMAT-NEXT:    add a2, a0, s2
+; REMAT-NEXT:    li a2, 21
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 6
-; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, 1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    li a2, 11
+; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    lui s1, 6
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    lui s0, 6
-; REMAT-NEXT:    addiw s0, s0, 512
-; REMAT-NEXT:    add a2, a0, s0
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 25
+; REMAT-NEXT:    li a2, 23
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui t6, 6
-; REMAT-NEXT:    addiw t6, t6, 1536
-; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li t5, 13
-; REMAT-NEXT:    slli t5, t5, 11
-; REMAT-NEXT:    add a2, a0, t5
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s1, 6
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    addiw a2, a2, -1536
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s0, 6
+; REMAT-NEXT:    addiw s0, s0, 512
+; REMAT-NEXT:    add a2, a0, s0
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li t4, 27
-; REMAT-NEXT:    slli t4, t4, 10
-; REMAT-NEXT:    add a2, a0, t4
+; REMAT-NEXT:    li a2, 25
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    addiw a2, a2, -512
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t6, 6
+; REMAT-NEXT:    addiw t6, t6, 1536
+; REMAT-NEXT:    add a2, a0, t6
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui a2, 7
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    lui t3, 7
+; REMAT-NEXT:    li t5, 13
+; REMAT-NEXT:    slli t5, t5, 11
+; REMAT-NEXT:    add a2, a0, t5
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    lui t2, 7
-; REMAT-NEXT:    addiw t2, t2, 512
-; REMAT-NEXT:    add a2, a0, t2
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li t1, 29
-; REMAT-NEXT:    slli t1, t1, 10
-; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    li t4, 27
+; REMAT-NEXT:    slli t4, t4, 10
+; REMAT-NEXT:    add a2, a0, t4
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    lui t0, 7
-; REMAT-NEXT:    addiw t0, t0, 1536
-; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a7, 15
-; REMAT-NEXT:    slli a7, a7, 11
-; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t3, 7
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    lui a6, 8
-; REMAT-NEXT:    addiw a6, a6, -1536
-; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    lui t2, 7
+; REMAT-NEXT:    addiw t2, t2, 512
+; REMAT-NEXT:    add a2, a0, t2
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    li a4, 31
-; REMAT-NEXT:    slli a4, a4, 10
-; REMAT-NEXT:    add a2, a0, a4
+; REMAT-NEXT:    li t1, 29
+; REMAT-NEXT:    slli t1, t1, 10
+; REMAT-NEXT:    add a2, a0, t1
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    lui a3, 8
-; REMAT-NEXT:    addiw a3, a3, -512
-; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    lui t0, 7
+; REMAT-NEXT:    addiw t0, t0, 1536
+; REMAT-NEXT:    add a2, a0, t0
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    lui a2, 8
-; REMAT-NEXT:    add a0, a0, a2
-; REMAT-NEXT:    vle32.v v2, (a0)
+; REMAT-NEXT:    li a7, 15
+; REMAT-NEXT:    slli a7, a7, 11
+; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    lui a6, 8
+; REMAT-NEXT:    addiw a6, a6, -1536
+; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    li a4, 31
+; REMAT-NEXT:    slli a4, a4, 10
+; REMAT-NEXT:    add a2, a0, a4
+; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    lui a3, 8
+; REMAT-NEXT:    addiw a3, a3, -512
+; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT:    vle32.v v10, (a2)
+; REMAT-NEXT:    lui a2, 8
+; REMAT-NEXT:    add a0, a0, a2
+; REMAT-NEXT:    vle32.v v28, (a0)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    addi a0, a1, 1024
 ; REMAT-NEXT:    vse32.v v8, (a0)
@@ -1457,41 +1397,36 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 336(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 15
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 328(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    lui a0, 2
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 320(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 17
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s2, a1, s2
+; REMAT-NEXT:    sd s2, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s3, a1, s3
+; REMAT-NEXT:    sd s3, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s4, a1, s4
+; REMAT-NEXT:    sd s4, 312(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s5, a1, s5
 ; REMAT-NEXT:    sd s5, 304(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s6, a1, s6
 ; REMAT-NEXT:    sd s6, 296(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s7, a1, s7
 ; REMAT-NEXT:    sd s7, 288(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s8, a1, s8
-; REMAT-NEXT:    sd s8, 280(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s9, a1, s9
-; REMAT-NEXT:    sd s9, 272(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s10, a1, s10
-; REMAT-NEXT:    sd s10, 264(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s11, a1, s11
-; REMAT-NEXT:    sd s11, 256(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 25
+; REMAT-NEXT:    li a0, 21
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 248(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 13
+; REMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 11
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add a5, a1, a5
-; REMAT-NEXT:    sd a5, 232(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd a5, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s8, a1, s8
+; REMAT-NEXT:    sd s8, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s9, a1, s9
+; REMAT-NEXT:    sd s9, 248(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s10, a1, s10
+; REMAT-NEXT:    sd s10, 240(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s11, a1, s11
+; REMAT-NEXT:    sd s11, 232(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add ra, a1, ra
 ; REMAT-NEXT:    sd ra, 224(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 29
@@ -1548,16 +1483,22 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s4, a1, s4
-; REMAT-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    addiw a0, a0, 1536
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 11
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s3, a1, s3
-; REMAT-NEXT:    sd s3, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s2, a1, s2
-; REMAT-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 6
+; REMAT-NEXT:    addiw a0, a0, -1536
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 23
+; REMAT-NEXT:    slli a0, a0, 10
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 6
 ; REMAT-NEXT:    addiw a0, a0, -512
 ; REMAT-NEXT:    add a0, a1, a0
@@ -1854,8 +1795,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    csrr a0, vlenb
-; REMAT-NEXT:    li a1, 14
-; REMAT-NEXT:    mul a0, a0, a1
+; REMAT-NEXT:    slli a0, a0, 3
 ; REMAT-NEXT:    add sp, sp, a0
 ; REMAT-NEXT:    .cfi_def_cfa sp, 544
 ; REMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 575a757149ebba..0b5856a7000dd4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -5682,28 +5682,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ;
 ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    addi sp, sp, -48
-; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 48
-; RV32ZVE32F-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    addi sp, sp, -16
+; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
+; RV32ZVE32F-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s3, 0(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
 ; RV32ZVE32F-NEXT:    .cfi_offset s3, -16
-; RV32ZVE32F-NEXT:    .cfi_offset s4, -20
-; RV32ZVE32F-NEXT:    .cfi_offset s5, -24
-; RV32ZVE32F-NEXT:    .cfi_offset s6, -28
-; RV32ZVE32F-NEXT:    .cfi_offset s7, -32
-; RV32ZVE32F-NEXT:    .cfi_offset s8, -36
-; RV32ZVE32F-NEXT:    .cfi_offset s9, -40
 ; RV32ZVE32F-NEXT:    .cfi_remember_state
 ; RV32ZVE32F-NEXT:    lw a3, 56(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 60(a0)
@@ -5715,30 +5703,30 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    lw t4, 28(a0)
 ; RV32ZVE32F-NEXT:    lw t1, 32(a0)
 ; RV32ZVE32F-NEXT:    lw t2, 36(a0)
+; RV32ZVE32F-NEXT:    lw t5, 0(a2)
+; RV32ZVE32F-NEXT:    lw t6, 8(a2)
+; RV32ZVE32F-NEXT:    lw s0, 16(a2)
+; RV32ZVE32F-NEXT:    lw s1, 24(a2)
+; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.v.x v8, t5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t6
+; RV32ZVE32F-NEXT:    lw t5, 32(a2)
+; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw s2, 48(a2)
+; RV32ZVE32F-NEXT:    lw s3, 56(a2)
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s0
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s1
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t6
 ; RV32ZVE32F-NEXT:    lw s0, 8(a0)
 ; RV32ZVE32F-NEXT:    lw s1, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t5, 16(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 20(a0)
-; RV32ZVE32F-NEXT:    lw s2, 32(a2)
-; RV32ZVE32F-NEXT:    lw s3, 40(a2)
-; RV32ZVE32F-NEXT:    lw s4, 48(a2)
-; RV32ZVE32F-NEXT:    lw s5, 56(a2)
-; RV32ZVE32F-NEXT:    lw s6, 0(a2)
-; RV32ZVE32F-NEXT:    lw s7, 8(a2)
-; RV32ZVE32F-NEXT:    lw s8, 16(a2)
-; RV32ZVE32F-NEXT:    lw s9, 24(a2)
-; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.v.x v8, s6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s9
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    andi s2, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5771,27 +5759,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    sw a3, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB51_9: # %else14
-; RV32ZVE32F-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s3, 0(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    .cfi_restore s0
 ; RV32ZVE32F-NEXT:    .cfi_restore s1
 ; RV32ZVE32F-NEXT:    .cfi_restore s2
 ; RV32ZVE32F-NEXT:    .cfi_restore s3
-; RV32ZVE32F-NEXT:    .cfi_restore s4
-; RV32ZVE32F-NEXT:    .cfi_restore s5
-; RV32ZVE32F-NEXT:    .cfi_restore s6
-; RV32ZVE32F-NEXT:    .cfi_restore s7
-; RV32ZVE32F-NEXT:    .cfi_restore s8
-; RV32ZVE32F-NEXT:    .cfi_restore s9
-; RV32ZVE32F-NEXT:    addi sp, sp, 48
+; RV32ZVE32F-NEXT:    addi sp, sp, 16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZVE32F-NEXT:    ret
 ; RV32ZVE32F-NEXT:  .LBB51_10: # %cond.store
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index a11c02dd5e2cb4..036fee6a13ca4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1306,6 +1306,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 308(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 218(sp)
+; ZVFHMIN32-NEXT:    lh a0, 562(sp)
+; ZVFHMIN32-NEXT:    lh a1, 306(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
@@ -1358,86 +1364,82 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 13
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 8
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v16
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v18, v8, 8
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 218(sp)
-; ZVFHMIN32-NEXT:    lh a0, 562(sp)
-; ZVFHMIN32-NEXT:    lh a1, 306(sp)
+; ZVFHMIN32-NEXT:    sb a0, 217(sp)
+; ZVFHMIN32-NEXT:    lh a0, 560(sp)
+; ZVFHMIN32-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v16, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v19, v16, 5
 ; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 10
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a4, a2, 4
+; ZVFHMIN32-NEXT:    sub a2, a4, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 11
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 19
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a4, a2, 4
+; ZVFHMIN32-NEXT:    add a2, a4, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN32-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 11
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 8
+; ZVFHMIN32-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v4, v16, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v16, 8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 217(sp)
-; ZVFHMIN32-NEXT:    lh a0, 560(sp)
-; ZVFHMIN32-NEXT:    lh a1, 304(sp)
+; ZVFHMIN32-NEXT:    sb a0, 216(sp)
+; ZVFHMIN32-NEXT:    lh a0, 558(sp)
+; ZVFHMIN32-NEXT:    lh a1, 302(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 5
 ; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 3
-; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 2
-; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 1
+; ZVFHMIN32-NEXT:    vslidedown.vi v31, v0, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v5, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 1
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
@@ -1447,99 +1449,88 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 14
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 3
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 13
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 6
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 6
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 12
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 12
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a2, a2, 3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 11
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 10
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    li a4, 13
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    li a4, 19
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a4, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a4
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
-; ZVFHMIN32-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v26
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 216(sp)
-; ZVFHMIN32-NEXT:    lh a0, 558(sp)
-; ZVFHMIN32-NEXT:    lh a1, 302(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 300(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v4
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v26
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v30
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
+; ZVFHMIN32-NEXT:    vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v16
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v6
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 296(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v22
-; ZVFHMIN32-NEXT:    sw a2, 104(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN32-NEXT:    sw a2, 108(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v22
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 294(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v14
-; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN32-NEXT:    sw a2, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v20
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN32-NEXT:    sw a2, 108(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 211(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 548(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 292(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    sw a2, 120(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v14
+; ZVFHMIN32-NEXT:    sw a2, 116(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    sw a2, 124(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
@@ -1548,33 +1539,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 210(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 546(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 290(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v24
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v24
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN32-NEXT:    sb a0, 209(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 544(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 288(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 192(sp)
+; ZVFHMIN32-NEXT:    sb a3, 192(sp)
 ; ZVFHMIN32-NEXT:    sb a0, 208(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 738(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 482(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 29
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s7, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 28
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
+; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v12
+; ZVFHMIN32-NEXT:    sw a2, 120(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1582,15 +1567,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 736(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 480(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 27
+; ZVFHMIN32-NEXT:    li a3, 29
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s8, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 26
+; ZVFHMIN32-NEXT:    li a3, 28
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1598,15 +1583,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 734(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 478(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 25
+; ZVFHMIN32-NEXT:    li a3, 27
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s9, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s6, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    li a3, 26
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s6, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1614,138 +1599,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 732(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 476(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 23
+; ZVFHMIN32-NEXT:    li a3, 25
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN32-NEXT:    lh s7, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 174(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 730(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 474(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v31
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v5
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 23
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s8, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v21
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 173(sp)
-; ZVFHMIN32-NEXT:    lh a1, 728(sp)
-; ZVFHMIN32-NEXT:    lh s10, 472(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v9
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v11
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 172(sp)
-; ZVFHMIN32-NEXT:    lh a1, 726(sp)
-; ZVFHMIN32-NEXT:    lh s10, 470(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v13
-; ZVFHMIN32-NEXT:    vmv.x.s s11, v29
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 171(sp)
-; ZVFHMIN32-NEXT:    lh ra, 724(sp)
-; ZVFHMIN32-NEXT:    lh a0, 468(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v27
-; ZVFHMIN32-NEXT:    vmv.x.s s10, v7
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, ra
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    lh a0, 728(sp)
+; ZVFHMIN32-NEXT:    lh a1, 472(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v3
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v19
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 172(sp)
+; ZVFHMIN32-NEXT:    lh a0, 726(sp)
+; ZVFHMIN32-NEXT:    lh a1, 470(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v11
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v7
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 171(sp)
+; ZVFHMIN32-NEXT:    lh a0, 724(sp)
+; ZVFHMIN32-NEXT:    lh s9, 468(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v29
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 170(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 466(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s ra, v21
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s7
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v31
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 169(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 720(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 464(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, s8
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v27
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN32-NEXT:    sb a0, 168(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 718(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 462(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s5
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, s9
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a0
-; ZVFHMIN32-NEXT:    fmv.h.x ft0, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa0, ft0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s6
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 167(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 716(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, s6
 ; ZVFHMIN32-NEXT:    lh a1, 460(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft0, a3
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s3
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, s7
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa0, ft0
+; ZVFHMIN32-NEXT:    sb a0, 166(sp)
+; ZVFHMIN32-NEXT:    lh a0, 714(sp)
+; ZVFHMIN32-NEXT:    lh a1, 458(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, s4
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, s8
 ; ZVFHMIN32-NEXT:    fmv.h.x ft1, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, ft1, ft2
+; ZVFHMIN32-NEXT:    sb a0, 165(sp)
+; ZVFHMIN32-NEXT:    lh a0, 712(sp)
+; ZVFHMIN32-NEXT:    lh a1, 456(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, s10
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, s11
+; ZVFHMIN32-NEXT:    fmv.h.x ft3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, ft3, ft4
+; ZVFHMIN32-NEXT:    sb a0, 164(sp)
+; ZVFHMIN32-NEXT:    lh a0, 710(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft3, a4
+; ZVFHMIN32-NEXT:    lh a1, 454(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft4, ra
+; ZVFHMIN32-NEXT:    fmv.h.x ft5, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h a1, ft1, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    sb a1, 166(sp)
-; ZVFHMIN32-NEXT:    lh a1, 714(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x ft0, a2
-; ZVFHMIN32-NEXT:    lh a2, 458(sp)
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT:    feq.h a1, ft5, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT:    sb a1, 163(sp)
+; ZVFHMIN32-NEXT:    lh a1, 708(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, a2
+; ZVFHMIN32-NEXT:    lh a2, 452(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa0, fa5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa3, ft0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT:    sb a2, 165(sp)
-; ZVFHMIN32-NEXT:    lh a2, 712(sp)
-; ZVFHMIN32-NEXT:    lh a4, 456(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s11
-; ZVFHMIN32-NEXT:    feq.h s3, fa2, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    sb a2, 164(sp)
-; ZVFHMIN32-NEXT:    lh a2, 710(sp)
-; ZVFHMIN32-NEXT:    lh a4, 454(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, s10
-; ZVFHMIN32-NEXT:    feq.h a5, fa1, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, ra
-; ZVFHMIN32-NEXT:    sb a2, 163(sp)
-; ZVFHMIN32-NEXT:    lh a2, 708(sp)
-; ZVFHMIN32-NEXT:    lh a4, 452(sp)
-; ZVFHMIN32-NEXT:    feq.h s4, fa0, fa3
-; ZVFHMIN32-NEXT:    feq.h s5, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a1, ft0, ft1
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN32-NEXT:    sb a2, 162(sp)
 ; ZVFHMIN32-NEXT:    lh a2, 706(sp)
 ; ZVFHMIN32-NEXT:    lh a4, 450(sp)
-; ZVFHMIN32-NEXT:    sb s5, 129(sp)
-; ZVFHMIN32-NEXT:    sb s4, 130(sp)
-; ZVFHMIN32-NEXT:    sb a5, 131(sp)
-; ZVFHMIN32-NEXT:    sb s3, 132(sp)
+; ZVFHMIN32-NEXT:    sb a1, 129(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa1, fa5
+; ZVFHMIN32-NEXT:    sb a3, 130(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa2, ft4
+; ZVFHMIN32-NEXT:    sb a1, 131(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa4, ft2
+; ZVFHMIN32-NEXT:    sb a3, 132(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa3, ft3
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 133(sp)
-; ZVFHMIN32-NEXT:    sb a3, 134(sp)
+; ZVFHMIN32-NEXT:    sb a3, 133(sp)
+; ZVFHMIN32-NEXT:    sb a1, 134(sp)
 ; ZVFHMIN32-NEXT:    sb a0, 135(sp)
 ; ZVFHMIN32-NEXT:    sb a2, 161(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 354(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s6, v23
+; ZVFHMIN32-NEXT:    vmv.x.s s4, v23
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    li a3, 10
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1753,13 +1748,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 352(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s5, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 4
+; ZVFHMIN32-NEXT:    sub a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
@@ -1768,148 +1762,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 350(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa2
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 239(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 604(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 348(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 238(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 602(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 346(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 237(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 600(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 344(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 236(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 598(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 342(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 235(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 596(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 340(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 234(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 594(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 338(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 233(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 592(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 336(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
 ; ZVFHMIN32-NEXT:    sb a0, 232(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 590(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
 ; ZVFHMIN32-NEXT:    lh a1, 334(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN32-NEXT:    feq.h t5, fa3, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a3
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 231(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 588(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a4
 ; ZVFHMIN32-NEXT:    lh a1, 332(sp)
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s6
-; ZVFHMIN32-NEXT:    sb a1, 230(sp)
-; ZVFHMIN32-NEXT:    lh a1, 586(sp)
-; ZVFHMIN32-NEXT:    lh a4, 330(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, s2
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, s5
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, ft0, ft1
+; ZVFHMIN32-NEXT:    sb a0, 230(sp)
+; ZVFHMIN32-NEXT:    lh a0, 586(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, s3
+; ZVFHMIN32-NEXT:    lh a1, 330(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, s6
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    feq.h a1, ft2, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s7
 ; ZVFHMIN32-NEXT:    sb a1, 229(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 584(sp)
-; ZVFHMIN32-NEXT:    lh a4, 328(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, s8
+; ZVFHMIN32-NEXT:    lh a2, 328(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN32-NEXT:    sb a1, 228(sp)
-; ZVFHMIN32-NEXT:    lh a1, 582(sp)
-; ZVFHMIN32-NEXT:    lh a4, 326(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT:    sb a1, 227(sp)
-; ZVFHMIN32-NEXT:    lh a1, 580(sp)
-; ZVFHMIN32-NEXT:    lh a4, 324(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa3, ft1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 226(sp)
-; ZVFHMIN32-NEXT:    lh a1, 578(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    sb a2, 228(sp)
+; ZVFHMIN32-NEXT:    lh a2, 582(sp)
+; ZVFHMIN32-NEXT:    lh a4, 326(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h t4, fa2, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s11
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, ra
+; ZVFHMIN32-NEXT:    sb a2, 227(sp)
+; ZVFHMIN32-NEXT:    lh a2, 580(sp)
+; ZVFHMIN32-NEXT:    lh a4, 324(sp)
+; ZVFHMIN32-NEXT:    feq.h t5, fa0, fa5
+; ZVFHMIN32-NEXT:    feq.h t6, ft0, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN32-NEXT:    sb a2, 226(sp)
+; ZVFHMIN32-NEXT:    lh a2, 578(sp)
 ; ZVFHMIN32-NEXT:    lh a4, 322(sp)
-; ZVFHMIN32-NEXT:    sb a2, 193(sp)
-; ZVFHMIN32-NEXT:    sb s2, 194(sp)
+; ZVFHMIN32-NEXT:    sb t6, 193(sp)
+; ZVFHMIN32-NEXT:    feq.h t6, fa1, fa4
+; ZVFHMIN32-NEXT:    sb t5, 194(sp)
 ; ZVFHMIN32-NEXT:    sb t6, 195(sp)
-; ZVFHMIN32-NEXT:    sb a5, 196(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb t4, 196(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 197(sp)
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 197(sp)
 ; ZVFHMIN32-NEXT:    sb a3, 198(sp)
-; ZVFHMIN32-NEXT:    sb t5, 199(sp)
-; ZVFHMIN32-NEXT:    sb a1, 225(sp)
+; ZVFHMIN32-NEXT:    sb a0, 199(sp)
+; ZVFHMIN32-NEXT:    sb a2, 225(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 19
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 4
+; ZVFHMIN32-NEXT:    add a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    li a3, 11
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
@@ -1921,301 +1915,305 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 508(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v6
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 2
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v4
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v30
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 190(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 506(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    csrr a3, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a3, 3
+; ZVFHMIN32-NEXT:    slli a3, a3, 1
 ; ZVFHMIN32-NEXT:    add a3, sp, a3
 ; ZVFHMIN32-NEXT:    addi a3, a3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 189(sp)
+; ZVFHMIN32-NEXT:    lh a0, 760(sp)
+; ZVFHMIN32-NEXT:    lh a1, 504(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
 ; ZVFHMIN32-NEXT:    csrr a4, vlenb
-; ZVFHMIN32-NEXT:    li a5, 6
-; ZVFHMIN32-NEXT:    mul a4, a4, a5
+; ZVFHMIN32-NEXT:    li t3, 6
+; ZVFHMIN32-NEXT:    mul a4, a4, t3
 ; ZVFHMIN32-NEXT:    add a4, sp, a4
 ; ZVFHMIN32-NEXT:    addi a4, a4, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 189(sp)
-; ZVFHMIN32-NEXT:    lh a1, 760(sp)
-; ZVFHMIN32-NEXT:    lh a5, 504(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li s3, 12
-; ZVFHMIN32-NEXT:    mul a0, a0, s3
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    addi a0, a0, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a0, 188(sp)
+; ZVFHMIN32-NEXT:    lh a0, 758(sp)
+; ZVFHMIN32-NEXT:    lh a1, 502(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    csrr t2, vlenb
+; ZVFHMIN32-NEXT:    slli t2, t2, 3
+; ZVFHMIN32-NEXT:    add t2, sp, t2
+; ZVFHMIN32-NEXT:    addi t2, t2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 187(sp)
+; ZVFHMIN32-NEXT:    lh a0, 756(sp)
+; ZVFHMIN32-NEXT:    lh a1, 500(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, t1
+; ZVFHMIN32-NEXT:    csrr t1, vlenb
+; ZVFHMIN32-NEXT:    li t3, 13
+; ZVFHMIN32-NEXT:    mul t1, t1, t3
+; ZVFHMIN32-NEXT:    add t1, sp, t1
+; ZVFHMIN32-NEXT:    addi t1, t1, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN32-NEXT:    sb a0, 186(sp)
+; ZVFHMIN32-NEXT:    lh a0, 754(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, t0
+; ZVFHMIN32-NEXT:    lh a1, 498(sp)
+; ZVFHMIN32-NEXT:    csrr t0, vlenb
+; ZVFHMIN32-NEXT:    li t1, 19
+; ZVFHMIN32-NEXT:    mul t0, t0, t1
+; ZVFHMIN32-NEXT:    add t0, sp, t0
+; ZVFHMIN32-NEXT:    addi t0, t0, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li s3, 10
-; ZVFHMIN32-NEXT:    mul a0, a0, s3
+; ZVFHMIN32-NEXT:    li t0, 21
+; ZVFHMIN32-NEXT:    mul a0, a0, t0
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    addi a0, a0, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 188(sp)
-; ZVFHMIN32-NEXT:    lh a1, 758(sp)
-; ZVFHMIN32-NEXT:    lh a5, 502(sp)
-; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    slli s3, s3, 4
-; ZVFHMIN32-NEXT:    add s3, sp, s3
-; ZVFHMIN32-NEXT:    addi s3, s3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s4, v8
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v16
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    sb a1, 187(sp)
-; ZVFHMIN32-NEXT:    lh a1, 756(sp)
-; ZVFHMIN32-NEXT:    lh a5, 500(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
-; ZVFHMIN32-NEXT:    sb a1, 186(sp)
-; ZVFHMIN32-NEXT:    lh a1, 754(sp)
-; ZVFHMIN32-NEXT:    lh a2, 498(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa1, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a2
 ; ZVFHMIN32-NEXT:    sb a1, 185(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 752(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a3
 ; ZVFHMIN32-NEXT:    lh a2, 496(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    feq.h t1, fa4, fa0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
 ; ZVFHMIN32-NEXT:    sb a1, 184(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 750(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN32-NEXT:    lh a2, 494(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s5
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    sb a1, 183(sp)
-; ZVFHMIN32-NEXT:    lh a1, 748(sp)
-; ZVFHMIN32-NEXT:    lh a2, 492(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a3, fa3, fa5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa2, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    sb a1, 182(sp)
-; ZVFHMIN32-NEXT:    lh a1, 746(sp)
-; ZVFHMIN32-NEXT:    lh a2, 490(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a2, 183(sp)
+; ZVFHMIN32-NEXT:    lh a2, 748(sp)
+; ZVFHMIN32-NEXT:    lh a4, 492(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    sb a1, 181(sp)
-; ZVFHMIN32-NEXT:    lh a1, 744(sp)
-; ZVFHMIN32-NEXT:    lh a2, 488(sp)
+; ZVFHMIN32-NEXT:    sb a2, 182(sp)
+; ZVFHMIN32-NEXT:    lh a2, 746(sp)
+; ZVFHMIN32-NEXT:    lh a4, 490(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
 ; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a2, 104(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
+; ZVFHMIN32-NEXT:    sb a2, 181(sp)
+; ZVFHMIN32-NEXT:    lh a2, 744(sp)
+; ZVFHMIN32-NEXT:    lh a4, 488(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a4, 108(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v0
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN32-NEXT:    sb a1, 180(sp)
-; ZVFHMIN32-NEXT:    lh a1, 742(sp)
-; ZVFHMIN32-NEXT:    lh a7, 486(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN32-NEXT:    sb a2, 180(sp)
+; ZVFHMIN32-NEXT:    lh a2, 742(sp)
+; ZVFHMIN32-NEXT:    lh t2, 486(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 179(sp)
-; ZVFHMIN32-NEXT:    lh a1, 740(sp)
-; ZVFHMIN32-NEXT:    lh a7, 484(sp)
-; ZVFHMIN32-NEXT:    sb a3, 140(sp)
-; ZVFHMIN32-NEXT:    sb t1, 141(sp)
-; ZVFHMIN32-NEXT:    sb t3, 142(sp)
-; ZVFHMIN32-NEXT:    sb t4, 143(sp)
-; ZVFHMIN32-NEXT:    sb a2, 136(sp)
-; ZVFHMIN32-NEXT:    sb a6, 137(sp)
-; ZVFHMIN32-NEXT:    sb a4, 138(sp)
-; ZVFHMIN32-NEXT:    sb a0, 139(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    sb a2, 179(sp)
+; ZVFHMIN32-NEXT:    lh a2, 740(sp)
+; ZVFHMIN32-NEXT:    lh t2, 484(sp)
+; ZVFHMIN32-NEXT:    sb a1, 140(sp)
+; ZVFHMIN32-NEXT:    sb a3, 141(sp)
+; ZVFHMIN32-NEXT:    sb t1, 142(sp)
+; ZVFHMIN32-NEXT:    sb t0, 143(sp)
+; ZVFHMIN32-NEXT:    sb a5, 136(sp)
+; ZVFHMIN32-NEXT:    sb a0, 137(sp)
+; ZVFHMIN32-NEXT:    sb a6, 138(sp)
+; ZVFHMIN32-NEXT:    sb a7, 139(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 178(sp)
-; ZVFHMIN32-NEXT:    lh a1, 638(sp)
-; ZVFHMIN32-NEXT:    lh a2, 382(sp)
+; ZVFHMIN32-NEXT:    lh a0, 638(sp)
+; ZVFHMIN32-NEXT:    lh a1, 382(sp)
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 255(sp)
-; ZVFHMIN32-NEXT:    lh a1, 636(sp)
-; ZVFHMIN32-NEXT:    lh a2, 380(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
 ; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 254(sp)
-; ZVFHMIN32-NEXT:    lh a1, 634(sp)
-; ZVFHMIN32-NEXT:    lh a2, 378(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 255(sp)
+; ZVFHMIN32-NEXT:    lh a0, 636(sp)
+; ZVFHMIN32-NEXT:    lh a1, 380(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
 ; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 253(sp)
-; ZVFHMIN32-NEXT:    lh a1, 632(sp)
-; ZVFHMIN32-NEXT:    lh a2, 376(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 254(sp)
+; ZVFHMIN32-NEXT:    lh a0, 634(sp)
+; ZVFHMIN32-NEXT:    lh a1, 378(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
 ; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 252(sp)
-; ZVFHMIN32-NEXT:    lh a1, 630(sp)
-; ZVFHMIN32-NEXT:    lh a2, 374(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 253(sp)
+; ZVFHMIN32-NEXT:    lh a0, 632(sp)
+; ZVFHMIN32-NEXT:    lh a1, 376(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
 ; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 251(sp)
-; ZVFHMIN32-NEXT:    lh a1, 628(sp)
-; ZVFHMIN32-NEXT:    lh a2, 372(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 252(sp)
+; ZVFHMIN32-NEXT:    lh a0, 630(sp)
+; ZVFHMIN32-NEXT:    lh a1, 374(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
 ; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a2, 108(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    sb a1, 250(sp)
-; ZVFHMIN32-NEXT:    lh a1, 626(sp)
-; ZVFHMIN32-NEXT:    lh a2, 370(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a2, 112(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    sb a1, 249(sp)
-; ZVFHMIN32-NEXT:    lh a1, 624(sp)
-; ZVFHMIN32-NEXT:    lh a2, 368(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 248(sp)
-; ZVFHMIN32-NEXT:    lh a0, 622(sp)
-; ZVFHMIN32-NEXT:    lh a1, 366(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 251(sp)
+; ZVFHMIN32-NEXT:    lh a0, 628(sp)
+; ZVFHMIN32-NEXT:    lh a1, 372(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 247(sp)
-; ZVFHMIN32-NEXT:    lh a0, 620(sp)
-; ZVFHMIN32-NEXT:    lh a1, 364(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 250(sp)
+; ZVFHMIN32-NEXT:    lh a0, 626(sp)
+; ZVFHMIN32-NEXT:    lh a1, 370(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    sb a0, 246(sp)
-; ZVFHMIN32-NEXT:    lh a0, 618(sp)
-; ZVFHMIN32-NEXT:    lh a1, 362(sp)
+; ZVFHMIN32-NEXT:    sb a0, 249(sp)
+; ZVFHMIN32-NEXT:    lh a1, 624(sp)
+; ZVFHMIN32-NEXT:    lh a3, 368(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a3, 112(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT:    sb a1, 248(sp)
+; ZVFHMIN32-NEXT:    lh a1, 622(sp)
+; ZVFHMIN32-NEXT:    lh a3, 366(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a3, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT:    sb a1, 247(sp)
+; ZVFHMIN32-NEXT:    lh a1, 620(sp)
+; ZVFHMIN32-NEXT:    lh a3, 364(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
 ; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT:    sb a0, 245(sp)
-; ZVFHMIN32-NEXT:    lh a0, 616(sp)
-; ZVFHMIN32-NEXT:    lh a1, 360(sp)
+; ZVFHMIN32-NEXT:    sb a1, 246(sp)
+; ZVFHMIN32-NEXT:    lh a1, 618(sp)
+; ZVFHMIN32-NEXT:    lh a3, 362(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    sb a0, 244(sp)
-; ZVFHMIN32-NEXT:    lh a0, 614(sp)
-; ZVFHMIN32-NEXT:    lh a1, 358(sp)
+; ZVFHMIN32-NEXT:    sb a1, 245(sp)
+; ZVFHMIN32-NEXT:    lh a1, 616(sp)
+; ZVFHMIN32-NEXT:    lh a3, 360(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
 ; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    sb a1, 244(sp)
+; ZVFHMIN32-NEXT:    lh a1, 614(sp)
+; ZVFHMIN32-NEXT:    lh a3, 358(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    sb a0, 243(sp)
-; ZVFHMIN32-NEXT:    lh a0, 612(sp)
-; ZVFHMIN32-NEXT:    lh a1, 356(sp)
-; ZVFHMIN32-NEXT:    sb a5, 204(sp)
-; ZVFHMIN32-NEXT:    sb a2, 205(sp)
-; ZVFHMIN32-NEXT:    sb a3, 206(sp)
-; ZVFHMIN32-NEXT:    sb a4, 207(sp)
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 200(sp)
-; ZVFHMIN32-NEXT:    sb a6, 201(sp)
-; ZVFHMIN32-NEXT:    sb a7, 202(sp)
-; ZVFHMIN32-NEXT:    sb t0, 203(sp)
-; ZVFHMIN32-NEXT:    li a2, 128
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    sb a1, 243(sp)
+; ZVFHMIN32-NEXT:    lh a1, 612(sp)
+; ZVFHMIN32-NEXT:    lh a3, 356(sp)
+; ZVFHMIN32-NEXT:    sb t0, 204(sp)
+; ZVFHMIN32-NEXT:    sb a4, 205(sp)
+; ZVFHMIN32-NEXT:    sb a0, 206(sp)
+; ZVFHMIN32-NEXT:    sb a2, 207(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 242(sp)
-; ZVFHMIN32-NEXT:    addi a0, sp, 128
-; ZVFHMIN32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN32-NEXT:    vle8.v v8, (a0)
+; ZVFHMIN32-NEXT:    sb a0, 200(sp)
+; ZVFHMIN32-NEXT:    sb a5, 201(sp)
+; ZVFHMIN32-NEXT:    sb a6, 202(sp)
+; ZVFHMIN32-NEXT:    sb a7, 203(sp)
+; ZVFHMIN32-NEXT:    li a0, 128
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 242(sp)
+; ZVFHMIN32-NEXT:    addi a1, sp, 128
+; ZVFHMIN32-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN32-NEXT:    vle8.v v8, (a1)
 ; ZVFHMIN32-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN32-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN32-NEXT:    addi sp, s0, -896
@@ -2442,6 +2440,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 308(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 218(sp)
+; ZVFHMIN64-NEXT:    lh a0, 562(sp)
+; ZVFHMIN64-NEXT:    lh a1, 306(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
@@ -2494,86 +2498,82 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 13
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 8
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v16
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v18, v8, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 218(sp)
-; ZVFHMIN64-NEXT:    lh a0, 562(sp)
-; ZVFHMIN64-NEXT:    lh a1, 306(sp)
+; ZVFHMIN64-NEXT:    sb a0, 217(sp)
+; ZVFHMIN64-NEXT:    lh a0, 560(sp)
+; ZVFHMIN64-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v16, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v19, v16, 5
 ; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 10
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a4, a2, 4
+; ZVFHMIN64-NEXT:    sub a2, a4, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 11
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 19
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a4, a2, 4
+; ZVFHMIN64-NEXT:    add a2, a4, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN64-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 11
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 8
+; ZVFHMIN64-NEXT:    vs2r.v v30, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v4, v16, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v16, 8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 217(sp)
-; ZVFHMIN64-NEXT:    lh a0, 560(sp)
-; ZVFHMIN64-NEXT:    lh a1, 304(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 5
-; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 3
-; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 2
-; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 1
+; ZVFHMIN64-NEXT:    sb a0, 216(sp)
+; ZVFHMIN64-NEXT:    lh a0, 558(sp)
+; ZVFHMIN64-NEXT:    lh a1, 302(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v31, v0, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v5, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 1
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
@@ -2583,99 +2583,88 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 14
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 3
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 13
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 6
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 6
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 12
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 12
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a2, a2, 3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 11
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 10
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    li a4, 13
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    li a4, 19
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a4, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a4
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
-; ZVFHMIN64-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v26
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 216(sp)
-; ZVFHMIN64-NEXT:    lh a0, 558(sp)
-; ZVFHMIN64-NEXT:    lh a1, 302(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 300(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v4
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v26
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v30
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
+; ZVFHMIN64-NEXT:    vl2r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v16
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 296(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v22
-; ZVFHMIN64-NEXT:    sd a2, 80(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN64-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v22
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 294(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v14
-; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN64-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v20
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN64-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 211(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 548(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 292(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v14
+; ZVFHMIN64-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    sd a2, 120(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
@@ -2684,33 +2673,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 210(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 546(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 290(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v24
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v24
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN64-NEXT:    sb a0, 209(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 544(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 288(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 192(sp)
+; ZVFHMIN64-NEXT:    sb a3, 192(sp)
 ; ZVFHMIN64-NEXT:    sb a0, 208(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 738(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 482(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 29
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s7, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 28
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
+; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v12
+; ZVFHMIN64-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2718,15 +2701,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 736(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 480(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 27
+; ZVFHMIN64-NEXT:    li a3, 29
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s8, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 26
+; ZVFHMIN64-NEXT:    li a3, 28
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2734,15 +2717,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 734(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 478(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 25
+; ZVFHMIN64-NEXT:    li a3, 27
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s9, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s6, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    li a3, 26
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s6, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2750,138 +2733,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 732(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 476(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 23
+; ZVFHMIN64-NEXT:    li a3, 25
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN64-NEXT:    lh s7, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 174(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 730(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 474(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v31
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v5
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 23
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s8, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v21
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 173(sp)
-; ZVFHMIN64-NEXT:    lh a1, 728(sp)
-; ZVFHMIN64-NEXT:    lh s10, 472(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v9
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v11
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 172(sp)
-; ZVFHMIN64-NEXT:    lh a1, 726(sp)
-; ZVFHMIN64-NEXT:    lh s10, 470(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v13
-; ZVFHMIN64-NEXT:    vmv.x.s s11, v29
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 171(sp)
-; ZVFHMIN64-NEXT:    lh ra, 724(sp)
-; ZVFHMIN64-NEXT:    lh a0, 468(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v27
-; ZVFHMIN64-NEXT:    vmv.x.s s10, v7
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, ra
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    lh a0, 728(sp)
+; ZVFHMIN64-NEXT:    lh a1, 472(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v3
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v19
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 172(sp)
+; ZVFHMIN64-NEXT:    lh a0, 726(sp)
+; ZVFHMIN64-NEXT:    lh a1, 470(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v11
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v7
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 171(sp)
+; ZVFHMIN64-NEXT:    lh a0, 724(sp)
+; ZVFHMIN64-NEXT:    lh s9, 468(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v29
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 170(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 466(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s ra, v21
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s7
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v31
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 169(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 720(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 464(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, s8
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v27
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
 ; ZVFHMIN64-NEXT:    sb a0, 168(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 718(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 462(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s5
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, s9
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a0
-; ZVFHMIN64-NEXT:    fmv.h.x ft0, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa0, ft0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s6
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 167(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 716(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, s6
 ; ZVFHMIN64-NEXT:    lh a1, 460(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft0, a3
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s3
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, s7
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa0, ft0
+; ZVFHMIN64-NEXT:    sb a0, 166(sp)
+; ZVFHMIN64-NEXT:    lh a0, 714(sp)
+; ZVFHMIN64-NEXT:    lh a1, 458(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, s4
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, s8
 ; ZVFHMIN64-NEXT:    fmv.h.x ft1, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, ft1, ft2
+; ZVFHMIN64-NEXT:    sb a0, 165(sp)
+; ZVFHMIN64-NEXT:    lh a0, 712(sp)
+; ZVFHMIN64-NEXT:    lh a1, 456(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, s10
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, s11
+; ZVFHMIN64-NEXT:    fmv.h.x ft3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, ft3, ft4
+; ZVFHMIN64-NEXT:    sb a0, 164(sp)
+; ZVFHMIN64-NEXT:    lh a0, 710(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft3, a4
+; ZVFHMIN64-NEXT:    lh a1, 454(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft4, ra
+; ZVFHMIN64-NEXT:    fmv.h.x ft5, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h a1, ft1, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    sb a1, 166(sp)
-; ZVFHMIN64-NEXT:    lh a1, 714(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x ft0, a2
-; ZVFHMIN64-NEXT:    lh a2, 458(sp)
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT:    feq.h a1, ft5, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT:    sb a1, 163(sp)
+; ZVFHMIN64-NEXT:    lh a1, 708(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, a2
+; ZVFHMIN64-NEXT:    lh a2, 452(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa0, fa5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa3, ft0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT:    sb a2, 165(sp)
-; ZVFHMIN64-NEXT:    lh a2, 712(sp)
-; ZVFHMIN64-NEXT:    lh a4, 456(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s11
-; ZVFHMIN64-NEXT:    feq.h s3, fa2, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    sb a2, 164(sp)
-; ZVFHMIN64-NEXT:    lh a2, 710(sp)
-; ZVFHMIN64-NEXT:    lh a4, 454(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, s10
-; ZVFHMIN64-NEXT:    feq.h a5, fa1, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, ra
-; ZVFHMIN64-NEXT:    sb a2, 163(sp)
-; ZVFHMIN64-NEXT:    lh a2, 708(sp)
-; ZVFHMIN64-NEXT:    lh a4, 452(sp)
-; ZVFHMIN64-NEXT:    feq.h s4, fa0, fa3
-; ZVFHMIN64-NEXT:    feq.h s5, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a1, ft0, ft1
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa0
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN64-NEXT:    sb a2, 162(sp)
 ; ZVFHMIN64-NEXT:    lh a2, 706(sp)
 ; ZVFHMIN64-NEXT:    lh a4, 450(sp)
-; ZVFHMIN64-NEXT:    sb s5, 129(sp)
-; ZVFHMIN64-NEXT:    sb s4, 130(sp)
-; ZVFHMIN64-NEXT:    sb a5, 131(sp)
-; ZVFHMIN64-NEXT:    sb s3, 132(sp)
+; ZVFHMIN64-NEXT:    sb a1, 129(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa1, fa5
+; ZVFHMIN64-NEXT:    sb a3, 130(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa2, ft4
+; ZVFHMIN64-NEXT:    sb a1, 131(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa4, ft2
+; ZVFHMIN64-NEXT:    sb a3, 132(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa3, ft3
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 133(sp)
-; ZVFHMIN64-NEXT:    sb a3, 134(sp)
+; ZVFHMIN64-NEXT:    sb a3, 133(sp)
+; ZVFHMIN64-NEXT:    sb a1, 134(sp)
 ; ZVFHMIN64-NEXT:    sb a0, 135(sp)
 ; ZVFHMIN64-NEXT:    sb a2, 161(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 354(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s6, v23
+; ZVFHMIN64-NEXT:    vmv.x.s s4, v23
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    li a3, 10
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2889,13 +2882,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 352(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s5, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 4
+; ZVFHMIN64-NEXT:    sub a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
@@ -2904,148 +2896,148 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 350(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa2
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 239(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 604(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 348(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 238(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 602(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 346(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 237(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 600(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 344(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 236(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 598(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 342(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 235(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 596(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 340(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 234(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 594(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 338(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 233(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 592(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 336(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
 ; ZVFHMIN64-NEXT:    sb a0, 232(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 590(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
 ; ZVFHMIN64-NEXT:    lh a1, 334(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN64-NEXT:    feq.h t5, fa3, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a3
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 231(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 588(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a4
 ; ZVFHMIN64-NEXT:    lh a1, 332(sp)
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s6
-; ZVFHMIN64-NEXT:    sb a1, 230(sp)
-; ZVFHMIN64-NEXT:    lh a1, 586(sp)
-; ZVFHMIN64-NEXT:    lh a4, 330(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, s2
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, s5
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, ft0, ft1
+; ZVFHMIN64-NEXT:    sb a0, 230(sp)
+; ZVFHMIN64-NEXT:    lh a0, 586(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, s3
+; ZVFHMIN64-NEXT:    lh a1, 330(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, s6
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    feq.h a1, ft2, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s7
 ; ZVFHMIN64-NEXT:    sb a1, 229(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 584(sp)
-; ZVFHMIN64-NEXT:    lh a4, 328(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN64-NEXT:    sb a1, 228(sp)
-; ZVFHMIN64-NEXT:    lh a1, 582(sp)
-; ZVFHMIN64-NEXT:    lh a4, 326(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, s8
+; ZVFHMIN64-NEXT:    lh a2, 328(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT:    sb a1, 227(sp)
-; ZVFHMIN64-NEXT:    lh a1, 580(sp)
-; ZVFHMIN64-NEXT:    lh a4, 324(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa3, ft1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 226(sp)
-; ZVFHMIN64-NEXT:    lh a1, 578(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    sb a2, 228(sp)
+; ZVFHMIN64-NEXT:    lh a2, 582(sp)
+; ZVFHMIN64-NEXT:    lh a4, 326(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h t4, fa2, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s11
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, ra
+; ZVFHMIN64-NEXT:    sb a2, 227(sp)
+; ZVFHMIN64-NEXT:    lh a2, 580(sp)
+; ZVFHMIN64-NEXT:    lh a4, 324(sp)
+; ZVFHMIN64-NEXT:    feq.h t5, fa0, fa5
+; ZVFHMIN64-NEXT:    feq.h t6, ft0, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa3
+; ZVFHMIN64-NEXT:    sb a2, 226(sp)
+; ZVFHMIN64-NEXT:    lh a2, 578(sp)
 ; ZVFHMIN64-NEXT:    lh a4, 322(sp)
-; ZVFHMIN64-NEXT:    sb a2, 193(sp)
-; ZVFHMIN64-NEXT:    sb s2, 194(sp)
+; ZVFHMIN64-NEXT:    sb t6, 193(sp)
+; ZVFHMIN64-NEXT:    feq.h t6, fa1, fa4
+; ZVFHMIN64-NEXT:    sb t5, 194(sp)
 ; ZVFHMIN64-NEXT:    sb t6, 195(sp)
-; ZVFHMIN64-NEXT:    sb a5, 196(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb t4, 196(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 197(sp)
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 197(sp)
 ; ZVFHMIN64-NEXT:    sb a3, 198(sp)
-; ZVFHMIN64-NEXT:    sb t5, 199(sp)
-; ZVFHMIN64-NEXT:    sb a1, 225(sp)
+; ZVFHMIN64-NEXT:    sb a0, 199(sp)
+; ZVFHMIN64-NEXT:    sb a2, 225(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 19
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 4
+; ZVFHMIN64-NEXT:    add a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    li a3, 11
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
@@ -3057,301 +3049,305 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 508(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v6
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 2
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v4
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v30
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 190(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 506(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    csrr a3, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a3, 3
+; ZVFHMIN64-NEXT:    slli a3, a3, 1
 ; ZVFHMIN64-NEXT:    add a3, sp, a3
 ; ZVFHMIN64-NEXT:    addi a3, a3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 189(sp)
+; ZVFHMIN64-NEXT:    lh a0, 760(sp)
+; ZVFHMIN64-NEXT:    lh a1, 504(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
 ; ZVFHMIN64-NEXT:    csrr a4, vlenb
-; ZVFHMIN64-NEXT:    li a5, 6
-; ZVFHMIN64-NEXT:    mul a4, a4, a5
+; ZVFHMIN64-NEXT:    li t3, 6
+; ZVFHMIN64-NEXT:    mul a4, a4, t3
 ; ZVFHMIN64-NEXT:    add a4, sp, a4
 ; ZVFHMIN64-NEXT:    addi a4, a4, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 189(sp)
-; ZVFHMIN64-NEXT:    lh a1, 760(sp)
-; ZVFHMIN64-NEXT:    lh a5, 504(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li s3, 12
-; ZVFHMIN64-NEXT:    mul a0, a0, s3
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    addi a0, a0, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a0, 188(sp)
+; ZVFHMIN64-NEXT:    lh a0, 758(sp)
+; ZVFHMIN64-NEXT:    lh a1, 502(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    csrr t2, vlenb
+; ZVFHMIN64-NEXT:    slli t2, t2, 3
+; ZVFHMIN64-NEXT:    add t2, sp, t2
+; ZVFHMIN64-NEXT:    addi t2, t2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (t2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 187(sp)
+; ZVFHMIN64-NEXT:    lh a0, 756(sp)
+; ZVFHMIN64-NEXT:    lh a1, 500(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, t1
+; ZVFHMIN64-NEXT:    csrr t1, vlenb
+; ZVFHMIN64-NEXT:    li t3, 13
+; ZVFHMIN64-NEXT:    mul t1, t1, t3
+; ZVFHMIN64-NEXT:    add t1, sp, t1
+; ZVFHMIN64-NEXT:    addi t1, t1, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (t1) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
+; ZVFHMIN64-NEXT:    sb a0, 186(sp)
+; ZVFHMIN64-NEXT:    lh a0, 754(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, t0
+; ZVFHMIN64-NEXT:    lh a1, 498(sp)
+; ZVFHMIN64-NEXT:    csrr t0, vlenb
+; ZVFHMIN64-NEXT:    li t1, 19
+; ZVFHMIN64-NEXT:    mul t0, t0, t1
+; ZVFHMIN64-NEXT:    add t0, sp, t0
+; ZVFHMIN64-NEXT:    addi t0, t0, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (t0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li s3, 10
-; ZVFHMIN64-NEXT:    mul a0, a0, s3
+; ZVFHMIN64-NEXT:    li t0, 21
+; ZVFHMIN64-NEXT:    mul a0, a0, t0
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    addi a0, a0, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 188(sp)
-; ZVFHMIN64-NEXT:    lh a1, 758(sp)
-; ZVFHMIN64-NEXT:    lh a5, 502(sp)
-; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    slli s3, s3, 4
-; ZVFHMIN64-NEXT:    add s3, sp, s3
-; ZVFHMIN64-NEXT:    addi s3, s3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s4, v8
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v16
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    sb a1, 187(sp)
-; ZVFHMIN64-NEXT:    lh a1, 756(sp)
-; ZVFHMIN64-NEXT:    lh a5, 500(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
-; ZVFHMIN64-NEXT:    sb a1, 186(sp)
-; ZVFHMIN64-NEXT:    lh a1, 754(sp)
-; ZVFHMIN64-NEXT:    lh a2, 498(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa1, fa0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a2
 ; ZVFHMIN64-NEXT:    sb a1, 185(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 752(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a3
 ; ZVFHMIN64-NEXT:    lh a2, 496(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    feq.h t1, fa4, fa0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
 ; ZVFHMIN64-NEXT:    sb a1, 184(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 750(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN64-NEXT:    lh a2, 494(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s5
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    sb a1, 183(sp)
-; ZVFHMIN64-NEXT:    lh a1, 748(sp)
-; ZVFHMIN64-NEXT:    lh a2, 492(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a3, fa3, fa5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa2, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    sb a1, 182(sp)
-; ZVFHMIN64-NEXT:    lh a1, 746(sp)
-; ZVFHMIN64-NEXT:    lh a2, 490(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a2, 183(sp)
+; ZVFHMIN64-NEXT:    lh a2, 748(sp)
+; ZVFHMIN64-NEXT:    lh a4, 492(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    sb a1, 181(sp)
-; ZVFHMIN64-NEXT:    lh a1, 744(sp)
-; ZVFHMIN64-NEXT:    lh a2, 488(sp)
+; ZVFHMIN64-NEXT:    sb a2, 182(sp)
+; ZVFHMIN64-NEXT:    lh a2, 746(sp)
+; ZVFHMIN64-NEXT:    lh a4, 490(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
 ; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a2, 80(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
+; ZVFHMIN64-NEXT:    sb a2, 181(sp)
+; ZVFHMIN64-NEXT:    lh a2, 744(sp)
+; ZVFHMIN64-NEXT:    lh a4, 488(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a4, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v0
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN64-NEXT:    sb a1, 180(sp)
-; ZVFHMIN64-NEXT:    lh a1, 742(sp)
-; ZVFHMIN64-NEXT:    lh a7, 486(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN64-NEXT:    sb a2, 180(sp)
+; ZVFHMIN64-NEXT:    lh a2, 742(sp)
+; ZVFHMIN64-NEXT:    lh t2, 486(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 179(sp)
-; ZVFHMIN64-NEXT:    lh a1, 740(sp)
-; ZVFHMIN64-NEXT:    lh a7, 484(sp)
-; ZVFHMIN64-NEXT:    sb a3, 140(sp)
-; ZVFHMIN64-NEXT:    sb t1, 141(sp)
-; ZVFHMIN64-NEXT:    sb t3, 142(sp)
-; ZVFHMIN64-NEXT:    sb t4, 143(sp)
-; ZVFHMIN64-NEXT:    sb a2, 136(sp)
-; ZVFHMIN64-NEXT:    sb a6, 137(sp)
-; ZVFHMIN64-NEXT:    sb a4, 138(sp)
-; ZVFHMIN64-NEXT:    sb a0, 139(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT:    sb a2, 179(sp)
+; ZVFHMIN64-NEXT:    lh a2, 740(sp)
+; ZVFHMIN64-NEXT:    lh t2, 484(sp)
+; ZVFHMIN64-NEXT:    sb a1, 140(sp)
+; ZVFHMIN64-NEXT:    sb a3, 141(sp)
+; ZVFHMIN64-NEXT:    sb t1, 142(sp)
+; ZVFHMIN64-NEXT:    sb t0, 143(sp)
+; ZVFHMIN64-NEXT:    sb a5, 136(sp)
+; ZVFHMIN64-NEXT:    sb a0, 137(sp)
+; ZVFHMIN64-NEXT:    sb a6, 138(sp)
+; ZVFHMIN64-NEXT:    sb a7, 139(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 178(sp)
-; ZVFHMIN64-NEXT:    lh a1, 638(sp)
-; ZVFHMIN64-NEXT:    lh a2, 382(sp)
+; ZVFHMIN64-NEXT:    lh a0, 638(sp)
+; ZVFHMIN64-NEXT:    lh a1, 382(sp)
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 255(sp)
-; ZVFHMIN64-NEXT:    lh a1, 636(sp)
-; ZVFHMIN64-NEXT:    lh a2, 380(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
 ; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 254(sp)
-; ZVFHMIN64-NEXT:    lh a1, 634(sp)
-; ZVFHMIN64-NEXT:    lh a2, 378(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 255(sp)
+; ZVFHMIN64-NEXT:    lh a0, 636(sp)
+; ZVFHMIN64-NEXT:    lh a1, 380(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
 ; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 253(sp)
-; ZVFHMIN64-NEXT:    lh a1, 632(sp)
-; ZVFHMIN64-NEXT:    lh a2, 376(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 254(sp)
+; ZVFHMIN64-NEXT:    lh a0, 634(sp)
+; ZVFHMIN64-NEXT:    lh a1, 378(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
 ; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 252(sp)
-; ZVFHMIN64-NEXT:    lh a1, 630(sp)
-; ZVFHMIN64-NEXT:    lh a2, 374(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 253(sp)
+; ZVFHMIN64-NEXT:    lh a0, 632(sp)
+; ZVFHMIN64-NEXT:    lh a1, 376(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
 ; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 251(sp)
-; ZVFHMIN64-NEXT:    lh a1, 628(sp)
-; ZVFHMIN64-NEXT:    lh a2, 372(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 252(sp)
+; ZVFHMIN64-NEXT:    lh a0, 630(sp)
+; ZVFHMIN64-NEXT:    lh a1, 374(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
 ; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a2, 88(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    sb a1, 250(sp)
-; ZVFHMIN64-NEXT:    lh a1, 626(sp)
-; ZVFHMIN64-NEXT:    lh a2, 370(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a2, 96(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    sb a1, 249(sp)
-; ZVFHMIN64-NEXT:    lh a1, 624(sp)
-; ZVFHMIN64-NEXT:    lh a2, 368(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 248(sp)
-; ZVFHMIN64-NEXT:    lh a0, 622(sp)
-; ZVFHMIN64-NEXT:    lh a1, 366(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 251(sp)
+; ZVFHMIN64-NEXT:    lh a0, 628(sp)
+; ZVFHMIN64-NEXT:    lh a1, 372(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 247(sp)
-; ZVFHMIN64-NEXT:    lh a0, 620(sp)
-; ZVFHMIN64-NEXT:    lh a1, 364(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 250(sp)
+; ZVFHMIN64-NEXT:    lh a0, 626(sp)
+; ZVFHMIN64-NEXT:    lh a1, 370(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    sb a0, 246(sp)
-; ZVFHMIN64-NEXT:    lh a0, 618(sp)
-; ZVFHMIN64-NEXT:    lh a1, 362(sp)
+; ZVFHMIN64-NEXT:    sb a0, 249(sp)
+; ZVFHMIN64-NEXT:    lh a1, 624(sp)
+; ZVFHMIN64-NEXT:    lh a3, 368(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a3, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT:    sb a1, 248(sp)
+; ZVFHMIN64-NEXT:    lh a1, 622(sp)
+; ZVFHMIN64-NEXT:    lh a3, 366(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    ld a3, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT:    sb a1, 247(sp)
+; ZVFHMIN64-NEXT:    lh a1, 620(sp)
+; ZVFHMIN64-NEXT:    lh a3, 364(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
 ; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT:    sb a0, 245(sp)
-; ZVFHMIN64-NEXT:    lh a0, 616(sp)
-; ZVFHMIN64-NEXT:    lh a1, 360(sp)
+; ZVFHMIN64-NEXT:    sb a1, 246(sp)
+; ZVFHMIN64-NEXT:    lh a1, 618(sp)
+; ZVFHMIN64-NEXT:    lh a3, 362(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
 ; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    sb a0, 244(sp)
-; ZVFHMIN64-NEXT:    lh a0, 614(sp)
-; ZVFHMIN64-NEXT:    lh a1, 358(sp)
+; ZVFHMIN64-NEXT:    sb a1, 245(sp)
+; ZVFHMIN64-NEXT:    lh a1, 616(sp)
+; ZVFHMIN64-NEXT:    lh a3, 360(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
 ; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    sb a1, 244(sp)
+; ZVFHMIN64-NEXT:    lh a1, 614(sp)
+; ZVFHMIN64-NEXT:    lh a3, 358(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    sb a0, 243(sp)
-; ZVFHMIN64-NEXT:    lh a0, 612(sp)
-; ZVFHMIN64-NEXT:    lh a1, 356(sp)
-; ZVFHMIN64-NEXT:    sb a5, 204(sp)
-; ZVFHMIN64-NEXT:    sb a2, 205(sp)
-; ZVFHMIN64-NEXT:    sb a3, 206(sp)
-; ZVFHMIN64-NEXT:    sb a4, 207(sp)
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 200(sp)
-; ZVFHMIN64-NEXT:    sb a6, 201(sp)
-; ZVFHMIN64-NEXT:    sb a7, 202(sp)
-; ZVFHMIN64-NEXT:    sb t0, 203(sp)
-; ZVFHMIN64-NEXT:    li a2, 128
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    sb a1, 243(sp)
+; ZVFHMIN64-NEXT:    lh a1, 612(sp)
+; ZVFHMIN64-NEXT:    lh a3, 356(sp)
+; ZVFHMIN64-NEXT:    sb t0, 204(sp)
+; ZVFHMIN64-NEXT:    sb a4, 205(sp)
+; ZVFHMIN64-NEXT:    sb a0, 206(sp)
+; ZVFHMIN64-NEXT:    sb a2, 207(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 242(sp)
-; ZVFHMIN64-NEXT:    addi a0, sp, 128
-; ZVFHMIN64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; ZVFHMIN64-NEXT:    vle8.v v8, (a0)
+; ZVFHMIN64-NEXT:    sb a0, 200(sp)
+; ZVFHMIN64-NEXT:    sb a5, 201(sp)
+; ZVFHMIN64-NEXT:    sb a6, 202(sp)
+; ZVFHMIN64-NEXT:    sb a7, 203(sp)
+; ZVFHMIN64-NEXT:    li a0, 128
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 242(sp)
+; ZVFHMIN64-NEXT:    addi a1, sp, 128
+; ZVFHMIN64-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; ZVFHMIN64-NEXT:    vle8.v v8, (a1)
 ; ZVFHMIN64-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN64-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN64-NEXT:    addi sp, s0, -896
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index 5b272c98a1e0ac..dd2a8240ee2533 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -507,34 +507,26 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
 define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
 ; RV32-LABEL: match_nxv16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s1, -12
-; RV32-NEXT:    .cfi_offset s2, -16
-; RV32-NEXT:    .cfi_offset s3, -20
-; RV32-NEXT:    .cfi_offset s4, -24
-; RV32-NEXT:    .cfi_offset s5, -28
-; RV32-NEXT:    .cfi_offset s6, -32
-; RV32-NEXT:    .cfi_offset s7, -36
-; RV32-NEXT:    .cfi_offset s8, -40
-; RV32-NEXT:    .cfi_offset s9, -44
-; RV32-NEXT:    .cfi_offset s10, -48
-; RV32-NEXT:    .cfi_offset s11, -52
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    .cfi_offset s3, -16
+; RV32-NEXT:    .cfi_offset s4, -20
+; RV32-NEXT:    .cfi_offset s5, -24
+; RV32-NEXT:    .cfi_offset s6, -28
+; RV32-NEXT:    .cfi_offset s7, -32
+; RV32-NEXT:    .cfi_offset s8, -36
 ; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v12, v10, 1
@@ -592,43 +584,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    vmv.x.s s5, v15
 ; RV32-NEXT:    vmv.x.s s6, v16
 ; RV32-NEXT:    vmv.x.s s7, v17
-; RV32-NEXT:    vmv.x.s s8, v18
-; RV32-NEXT:    vmv.x.s s9, v19
-; RV32-NEXT:    vmv.x.s s10, v20
-; RV32-NEXT:    vmv.x.s s11, v21
-; RV32-NEXT:    vsetvli ra, zero, e8, m2, ta, ma
+; RV32-NEXT:    vsetvli s8, zero, e8, m2, ta, ma
 ; RV32-NEXT:    vmseq.vx v12, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v22
+; RV32-NEXT:    vmv.x.s a0, v18
 ; RV32-NEXT:    vmseq.vx v13, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v23
+; RV32-NEXT:    vmv.x.s s2, v19
 ; RV32-NEXT:    vmseq.vx v14, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v24
-; RV32-NEXT:    vmseq.vx v15, v8, s5
-; RV32-NEXT:    vmv.x.s s5, v10
+; RV32-NEXT:    vmv.x.s s3, v20
+; RV32-NEXT:    vmseq.vx v15, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v21
+; RV32-NEXT:    vmseq.vx v16, v8, s5
+; RV32-NEXT:    vmv.x.s s5, v22
+; RV32-NEXT:    vmseq.vx v17, v8, s6
+; RV32-NEXT:    vmv.x.s s6, v23
+; RV32-NEXT:    vmseq.vx v18, v8, s7
+; RV32-NEXT:    vmv.x.s s7, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v24
+; RV32-NEXT:    vmseq.vx v19, v8, s2
+; RV32-NEXT:    vmv.x.s s2, v10
 ; RV32-NEXT:    vmor.mm v10, v12, v13
-; RV32-NEXT:    vmseq.vx v12, v8, s6
 ; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v13, v8, s7
-; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s8
 ; RV32-NEXT:    vmor.mm v10, v10, v15
-; RV32-NEXT:    vmseq.vx v14, v8, s9
-; RV32-NEXT:    vmor.mm v10, v10, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s10
-; RV32-NEXT:    vmor.mm v10, v10, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s11
-; RV32-NEXT:    vmor.mm v10, v10, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a0
-; RV32-NEXT:    vmor.mm v10, v10, v14
-; RV32-NEXT:    vmseq.vx v14, v8, s2
-; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmor.mm v10, v10, v16
+; RV32-NEXT:    vmor.mm v10, v10, v17
 ; RV32-NEXT:    vmseq.vx v12, v8, s3
-; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmor.mm v10, v10, v18
 ; RV32-NEXT:    vmseq.vx v13, v8, s4
 ; RV32-NEXT:    vmor.mm v10, v10, v11
 ; RV32-NEXT:    vmseq.vx v11, v8, s5
+; RV32-NEXT:    vmor.mm v10, v10, v19
+; RV32-NEXT:    vmseq.vx v14, v8, s6
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s7
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, a0
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s2
 ; RV32-NEXT:    vmor.mm v10, v10, v14
 ; RV32-NEXT:    vmseq.vx v14, v8, a1
 ; RV32-NEXT:    vmor.mm v10, v10, v12
@@ -666,20 +658,15 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    vmseq.vx v11, v8, s1
 ; RV32-NEXT:    vmor.mm v8, v10, v11
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -689,43 +676,32 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV32-NEXT:    .cfi_restore s6
 ; RV32-NEXT:    .cfi_restore s7
 ; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    .cfi_restore s10
-; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_nxv16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -112
-; RV64-NEXT:    .cfi_def_cfa_offset 112
-; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s1, -24
-; RV64-NEXT:    .cfi_offset s2, -32
-; RV64-NEXT:    .cfi_offset s3, -40
-; RV64-NEXT:    .cfi_offset s4, -48
-; RV64-NEXT:    .cfi_offset s5, -56
-; RV64-NEXT:    .cfi_offset s6, -64
-; RV64-NEXT:    .cfi_offset s7, -72
-; RV64-NEXT:    .cfi_offset s8, -80
-; RV64-NEXT:    .cfi_offset s9, -88
-; RV64-NEXT:    .cfi_offset s10, -96
-; RV64-NEXT:    .cfi_offset s11, -104
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    .cfi_def_cfa_offset 80
+; RV64-NEXT:    sd s0, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    .cfi_offset s1, -16
+; RV64-NEXT:    .cfi_offset s2, -24
+; RV64-NEXT:    .cfi_offset s3, -32
+; RV64-NEXT:    .cfi_offset s4, -40
+; RV64-NEXT:    .cfi_offset s5, -48
+; RV64-NEXT:    .cfi_offset s6, -56
+; RV64-NEXT:    .cfi_offset s7, -64
+; RV64-NEXT:    .cfi_offset s8, -72
 ; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v12, v10, 1
@@ -783,43 +759,43 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    vmv.x.s s5, v15
 ; RV64-NEXT:    vmv.x.s s6, v16
 ; RV64-NEXT:    vmv.x.s s7, v17
-; RV64-NEXT:    vmv.x.s s8, v18
-; RV64-NEXT:    vmv.x.s s9, v19
-; RV64-NEXT:    vmv.x.s s10, v20
-; RV64-NEXT:    vmv.x.s s11, v21
-; RV64-NEXT:    vsetvli ra, zero, e8, m2, ta, ma
+; RV64-NEXT:    vsetvli s8, zero, e8, m2, ta, ma
 ; RV64-NEXT:    vmseq.vx v12, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v22
+; RV64-NEXT:    vmv.x.s a0, v18
 ; RV64-NEXT:    vmseq.vx v13, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v23
+; RV64-NEXT:    vmv.x.s s2, v19
 ; RV64-NEXT:    vmseq.vx v14, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v24
-; RV64-NEXT:    vmseq.vx v15, v8, s5
-; RV64-NEXT:    vmv.x.s s5, v10
+; RV64-NEXT:    vmv.x.s s3, v20
+; RV64-NEXT:    vmseq.vx v15, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v21
+; RV64-NEXT:    vmseq.vx v16, v8, s5
+; RV64-NEXT:    vmv.x.s s5, v22
+; RV64-NEXT:    vmseq.vx v17, v8, s6
+; RV64-NEXT:    vmv.x.s s6, v23
+; RV64-NEXT:    vmseq.vx v18, v8, s7
+; RV64-NEXT:    vmv.x.s s7, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v24
+; RV64-NEXT:    vmseq.vx v19, v8, s2
+; RV64-NEXT:    vmv.x.s s2, v10
 ; RV64-NEXT:    vmor.mm v10, v12, v13
-; RV64-NEXT:    vmseq.vx v12, v8, s6
 ; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v13, v8, s7
-; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s8
 ; RV64-NEXT:    vmor.mm v10, v10, v15
-; RV64-NEXT:    vmseq.vx v14, v8, s9
-; RV64-NEXT:    vmor.mm v10, v10, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s10
-; RV64-NEXT:    vmor.mm v10, v10, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s11
-; RV64-NEXT:    vmor.mm v10, v10, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a0
-; RV64-NEXT:    vmor.mm v10, v10, v14
-; RV64-NEXT:    vmseq.vx v14, v8, s2
-; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmor.mm v10, v10, v16
+; RV64-NEXT:    vmor.mm v10, v10, v17
 ; RV64-NEXT:    vmseq.vx v12, v8, s3
-; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmor.mm v10, v10, v18
 ; RV64-NEXT:    vmseq.vx v13, v8, s4
 ; RV64-NEXT:    vmor.mm v10, v10, v11
 ; RV64-NEXT:    vmseq.vx v11, v8, s5
+; RV64-NEXT:    vmor.mm v10, v10, v19
+; RV64-NEXT:    vmseq.vx v14, v8, s6
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s7
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, a0
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s2
 ; RV64-NEXT:    vmor.mm v10, v10, v14
 ; RV64-NEXT:    vmseq.vx v14, v8, a1
 ; RV64-NEXT:    vmor.mm v10, v10, v12
@@ -857,20 +833,15 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    vmseq.vx v11, v8, s1
 ; RV64-NEXT:    vmor.mm v8, v10, v11
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    ld s0, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore s0
 ; RV64-NEXT:    .cfi_restore s1
 ; RV64-NEXT:    .cfi_restore s2
@@ -880,10 +851,7 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 ; RV64-NEXT:    .cfi_restore s6
 ; RV64-NEXT:    .cfi_restore s7
 ; RV64-NEXT:    .cfi_restore s8
-; RV64-NEXT:    .cfi_restore s9
-; RV64-NEXT:    .cfi_restore s10
-; RV64-NEXT:    .cfi_restore s11
-; RV64-NEXT:    addi sp, sp, 112
+; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
@@ -893,20 +861,16 @@ define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8
 define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
 ; RV32-LABEL: match_v16i8_v32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    .cfi_offset s2, -12
@@ -915,10 +879,6 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    .cfi_offset s5, -24
 ; RV32-NEXT:    .cfi_offset s6, -28
 ; RV32-NEXT:    .cfi_offset s7, -32
-; RV32-NEXT:    .cfi_offset s8, -36
-; RV32-NEXT:    .cfi_offset s9, -40
-; RV32-NEXT:    .cfi_offset s10, -44
-; RV32-NEXT:    .cfi_offset s11, -48
 ; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v9, v10, 1
@@ -976,42 +936,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    vmv.x.s s5, v14
 ; RV32-NEXT:    vmv.x.s s6, v15
 ; RV32-NEXT:    vmv.x.s s7, v16
-; RV32-NEXT:    vmv.x.s s8, v17
-; RV32-NEXT:    vmv.x.s s9, v18
-; RV32-NEXT:    vmv.x.s s10, v19
-; RV32-NEXT:    vmv.x.s s11, v20
 ; RV32-NEXT:    vmseq.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v21
+; RV32-NEXT:    vmv.x.s a0, v17
 ; RV32-NEXT:    vmseq.vx v12, v8, s2
-; RV32-NEXT:    vmv.x.s s2, v22
+; RV32-NEXT:    vmv.x.s s2, v18
 ; RV32-NEXT:    vmseq.vx v13, v8, s3
-; RV32-NEXT:    vmv.x.s s3, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s4
-; RV32-NEXT:    vmv.x.s s4, v23
-; RV32-NEXT:    vmseq.vx v14, v8, s5
-; RV32-NEXT:    vmv.x.s s5, v10
+; RV32-NEXT:    vmv.x.s s3, v19
+; RV32-NEXT:    vmseq.vx v14, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v20
+; RV32-NEXT:    vmseq.vx v15, v8, s5
+; RV32-NEXT:    vmv.x.s s5, v21
+; RV32-NEXT:    vmseq.vx v16, v8, s6
+; RV32-NEXT:    vmv.x.s s6, v22
+; RV32-NEXT:    vmseq.vx v17, v8, s7
+; RV32-NEXT:    vmv.x.s s7, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v23
+; RV32-NEXT:    vmseq.vx v18, v8, s2
+; RV32-NEXT:    vmv.x.s s2, v10
 ; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v10, v8, s6
 ; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v12, v8, s7
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, s8
 ; RV32-NEXT:    vmor.mm v9, v9, v14
-; RV32-NEXT:    vmseq.vx v13, v8, s9
-; RV32-NEXT:    vmor.mm v9, v9, v10
-; RV32-NEXT:    vmseq.vx v10, v8, s10
-; RV32-NEXT:    vmor.mm v9, v9, v12
-; RV32-NEXT:    vmseq.vx v12, v8, s11
-; RV32-NEXT:    vmor.mm v9, v9, v11
-; RV32-NEXT:    vmseq.vx v11, v8, a0
-; RV32-NEXT:    vmor.mm v9, v9, v13
-; RV32-NEXT:    vmseq.vx v13, v8, s2
-; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmor.mm v9, v9, v15
+; RV32-NEXT:    vmor.mm v9, v9, v16
 ; RV32-NEXT:    vmseq.vx v10, v8, s3
-; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmor.mm v9, v9, v17
 ; RV32-NEXT:    vmseq.vx v12, v8, s4
 ; RV32-NEXT:    vmor.mm v9, v9, v11
 ; RV32-NEXT:    vmseq.vx v11, v8, s5
+; RV32-NEXT:    vmor.mm v9, v9, v18
+; RV32-NEXT:    vmseq.vx v13, v8, s6
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, s7
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, a0
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s2
 ; RV32-NEXT:    vmor.mm v9, v9, v13
 ; RV32-NEXT:    vmseq.vx v13, v8, a1
 ; RV32-NEXT:    vmor.mm v9, v9, v10
@@ -1049,18 +1009,14 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    vmseq.vx v8, v8, s1
 ; RV32-NEXT:    vmor.mm v8, v9, v8
 ; RV32-NEXT:    vmand.mm v0, v8, v0
-; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 0(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -1069,30 +1025,22 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV32-NEXT:    .cfi_restore s5
 ; RV32-NEXT:    .cfi_restore s6
 ; RV32-NEXT:    .cfi_restore s7
-; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    .cfi_restore s10
-; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: match_v16i8_v32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -96
-; RV64-NEXT:    .cfi_def_cfa_offset 96
-; RV64-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -64
+; RV64-NEXT:    .cfi_def_cfa_offset 64
+; RV64-NEXT:    sd s0, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 0(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset s0, -8
 ; RV64-NEXT:    .cfi_offset s1, -16
 ; RV64-NEXT:    .cfi_offset s2, -24
@@ -1101,10 +1049,6 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    .cfi_offset s5, -48
 ; RV64-NEXT:    .cfi_offset s6, -56
 ; RV64-NEXT:    .cfi_offset s7, -64
-; RV64-NEXT:    .cfi_offset s8, -72
-; RV64-NEXT:    .cfi_offset s9, -80
-; RV64-NEXT:    .cfi_offset s10, -88
-; RV64-NEXT:    .cfi_offset s11, -96
 ; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v9, v10, 1
@@ -1162,42 +1106,42 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    vmv.x.s s5, v14
 ; RV64-NEXT:    vmv.x.s s6, v15
 ; RV64-NEXT:    vmv.x.s s7, v16
-; RV64-NEXT:    vmv.x.s s8, v17
-; RV64-NEXT:    vmv.x.s s9, v18
-; RV64-NEXT:    vmv.x.s s10, v19
-; RV64-NEXT:    vmv.x.s s11, v20
 ; RV64-NEXT:    vmseq.vx v9, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v21
+; RV64-NEXT:    vmv.x.s a0, v17
 ; RV64-NEXT:    vmseq.vx v12, v8, s2
-; RV64-NEXT:    vmv.x.s s2, v22
+; RV64-NEXT:    vmv.x.s s2, v18
 ; RV64-NEXT:    vmseq.vx v13, v8, s3
-; RV64-NEXT:    vmv.x.s s3, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s4
-; RV64-NEXT:    vmv.x.s s4, v23
-; RV64-NEXT:    vmseq.vx v14, v8, s5
-; RV64-NEXT:    vmv.x.s s5, v10
+; RV64-NEXT:    vmv.x.s s3, v19
+; RV64-NEXT:    vmseq.vx v14, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v20
+; RV64-NEXT:    vmseq.vx v15, v8, s5
+; RV64-NEXT:    vmv.x.s s5, v21
+; RV64-NEXT:    vmseq.vx v16, v8, s6
+; RV64-NEXT:    vmv.x.s s6, v22
+; RV64-NEXT:    vmseq.vx v17, v8, s7
+; RV64-NEXT:    vmv.x.s s7, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v23
+; RV64-NEXT:    vmseq.vx v18, v8, s2
+; RV64-NEXT:    vmv.x.s s2, v10
 ; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v10, v8, s6
 ; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v12, v8, s7
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, s8
 ; RV64-NEXT:    vmor.mm v9, v9, v14
-; RV64-NEXT:    vmseq.vx v13, v8, s9
-; RV64-NEXT:    vmor.mm v9, v9, v10
-; RV64-NEXT:    vmseq.vx v10, v8, s10
-; RV64-NEXT:    vmor.mm v9, v9, v12
-; RV64-NEXT:    vmseq.vx v12, v8, s11
-; RV64-NEXT:    vmor.mm v9, v9, v11
-; RV64-NEXT:    vmseq.vx v11, v8, a0
-; RV64-NEXT:    vmor.mm v9, v9, v13
-; RV64-NEXT:    vmseq.vx v13, v8, s2
-; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmor.mm v9, v9, v15
+; RV64-NEXT:    vmor.mm v9, v9, v16
 ; RV64-NEXT:    vmseq.vx v10, v8, s3
-; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmor.mm v9, v9, v17
 ; RV64-NEXT:    vmseq.vx v12, v8, s4
 ; RV64-NEXT:    vmor.mm v9, v9, v11
 ; RV64-NEXT:    vmseq.vx v11, v8, s5
+; RV64-NEXT:    vmor.mm v9, v9, v18
+; RV64-NEXT:    vmseq.vx v13, v8, s6
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, s7
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, a0
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s2
 ; RV64-NEXT:    vmor.mm v9, v9, v13
 ; RV64-NEXT:    vmseq.vx v13, v8, a1
 ; RV64-NEXT:    vmor.mm v9, v9, v10
@@ -1235,18 +1179,14 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    vmseq.vx v8, v8, s1
 ; RV64-NEXT:    vmor.mm v8, v9, v8
 ; RV64-NEXT:    vmand.mm v0, v8, v0
-; RV64-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 0(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore s0
 ; RV64-NEXT:    .cfi_restore s1
 ; RV64-NEXT:    .cfi_restore s2
@@ -1255,11 +1195,7 @@ define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %m
 ; RV64-NEXT:    .cfi_restore s5
 ; RV64-NEXT:    .cfi_restore s6
 ; RV64-NEXT:    .cfi_restore s7
-; RV64-NEXT:    .cfi_restore s8
-; RV64-NEXT:    .cfi_restore s9
-; RV64-NEXT:    .cfi_restore s10
-; RV64-NEXT:    .cfi_restore s11
-; RV64-NEXT:    addi sp, sp, 96
+; RV64-NEXT:    addi sp, sp, 64
 ; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 123048d996360c..22e6f23d4d6e6a 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2203,139 +2203,136 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s1, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t3, 5(a0)
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu s0, 7(a0)
-; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s6, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu s9, 12(a0)
-; RV32I-NEXT:    lbu s10, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s7, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu ra, 18(a0)
-; RV32I-NEXT:    lbu a3, 19(a0)
-; RV32I-NEXT:    lbu t5, 20(a0)
-; RV32I-NEXT:    lbu t6, 21(a0)
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t3, t1
-; RV32I-NEXT:    or a6, s0, t4
-; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
 ; RV32I-NEXT:    lbu s0, 25(a0)
 ; RV32I-NEXT:    lbu s1, 26(a0)
 ; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s5, s5, 8
 ; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    or t2, s3, t2
-; RV32I-NEXT:    or t3, s8, s6
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s7, s4
-; RV32I-NEXT:    or s4, s11, s5
-; RV32I-NEXT:    or s5, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s10, 2(a1)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    addi t6, sp, 8
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s0, t1
-; RV32I-NEXT:    or t1, s2, s1
-; RV32I-NEXT:    or s0, s6, s3
-; RV32I-NEXT:    or s1, s9, s8
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s10
-; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s2
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or a0, a0, t4
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or a7, a7, t5
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw a7, 28(sp)
-; RV32I-NEXT:    sw t0, 32(sp)
-; RV32I-NEXT:    sw s0, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or a0, a0, t5
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s5, s3
+; RV32I-NEXT:    or a1, a1, s1
+; RV32I-NEXT:    sw a7, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    add a1, t6, a1
+; RV32I-NEXT:    add a1, s4, a1
 ; RV32I-NEXT:    andi a0, t1, 24
-; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    xori a7, a0, 31
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw a7, 16(a1)
+; RV32I-NEXT:    lw t0, 16(a1)
 ; RV32I-NEXT:    lw t2, 20(a1)
 ; RV32I-NEXT:    lw t3, 24(a1)
 ; RV32I-NEXT:    lw t4, 28(a1)
@@ -2344,33 +2341,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl a1, a3, t1
 ; RV32I-NEXT:    slli t6, a4, 1
 ; RV32I-NEXT:    srl a3, a6, t1
-; RV32I-NEXT:    slli s0, a7, 1
+; RV32I-NEXT:    slli s0, t0, 1
 ; RV32I-NEXT:    srl a4, a5, t1
 ; RV32I-NEXT:    slli s1, a6, 1
 ; RV32I-NEXT:    srl a5, t2, t1
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    srl a6, a7, t1
+; RV32I-NEXT:    srl a6, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    srl a7, t3, t1
+; RV32I-NEXT:    srl t0, t3, t1
 ; RV32I-NEXT:    slli t3, t4, 1
 ; RV32I-NEXT:    srl t1, t4, t1
-; RV32I-NEXT:    sll t4, t5, t0
-; RV32I-NEXT:    sll t5, t6, t0
-; RV32I-NEXT:    sll t6, s0, t0
-; RV32I-NEXT:    sll s0, s1, t0
-; RV32I-NEXT:    sll s1, s2, t0
-; RV32I-NEXT:    sll t2, t2, t0
-; RV32I-NEXT:    sll t3, t3, t0
+; RV32I-NEXT:    sll t4, t5, a7
+; RV32I-NEXT:    sll t5, t6, a7
+; RV32I-NEXT:    sll t6, s0, a7
+; RV32I-NEXT:    sll s0, s1, a7
+; RV32I-NEXT:    sll s1, s2, a7
+; RV32I-NEXT:    sll t2, t2, a7
+; RV32I-NEXT:    sll t3, t3, a7
 ; RV32I-NEXT:    srli s2, t1, 24
 ; RV32I-NEXT:    srli s3, t1, 16
 ; RV32I-NEXT:    srli s4, t1, 8
-; RV32I-NEXT:    or t0, a0, t4
+; RV32I-NEXT:    or a7, a0, t4
 ; RV32I-NEXT:    or t4, a1, t5
 ; RV32I-NEXT:    or t5, a3, t6
 ; RV32I-NEXT:    or s0, a4, s0
 ; RV32I-NEXT:    or s1, a5, s1
 ; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    or t3, a7, t3
+; RV32I-NEXT:    or t3, t0, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
 ; RV32I-NEXT:    sb s3, 30(a2)
@@ -2387,23 +2384,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s6, s0, 24
 ; RV32I-NEXT:    srli s7, s0, 16
 ; RV32I-NEXT:    srli s0, s0, 8
-; RV32I-NEXT:    srli s8, t5, 24
-; RV32I-NEXT:    srli s9, t5, 16
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    srli s10, t4, 24
-; RV32I-NEXT:    srli s11, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    srli t0, t5, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    srli t5, t5, 8
 ; RV32I-NEXT:    sb t6, 26(a2)
+; RV32I-NEXT:    srli t6, t4, 24
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, t0, 24
+; RV32I-NEXT:    srli t1, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    srli a6, a7, 24
 ; RV32I-NEXT:    sb t2, 17(a2)
 ; RV32I-NEXT:    sb s3, 18(a2)
 ; RV32I-NEXT:    sb s2, 19(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    srli t2, a7, 16
+; RV32I-NEXT:    srli a7, a7, 8
 ; RV32I-NEXT:    sb a5, 20(a2)
 ; RV32I-NEXT:    sb s1, 21(a2)
 ; RV32I-NEXT:    sb s5, 22(a2)
@@ -2414,30 +2411,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    sb a3, 12(a2)
 ; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb s9, 14(a2)
-; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb t3, 14(a2)
+; RV32I-NEXT:    sb t0, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb s11, 2(a2)
-; RV32I-NEXT:    sb s10, 3(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb t0, 5(a2)
-; RV32I-NEXT:    sb a6, 6(a2)
-; RV32I-NEXT:    sb a7, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb t2, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2682,132 +2678,128 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ;
 ; RV32I-LABEL: lshr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a7, 1(a0)
-; RV32I-NEXT:    lbu t0, 2(a0)
-; RV32I-NEXT:    lbu t1, 3(a0)
-; RV32I-NEXT:    lbu s2, 4(a0)
-; RV32I-NEXT:    lbu s4, 5(a0)
-; RV32I-NEXT:    lbu s5, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s3, 8(a0)
-; RV32I-NEXT:    lbu s9, 9(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 11(a0)
-; RV32I-NEXT:    lbu ra, 12(a0)
-; RV32I-NEXT:    lbu a1, 13(a0)
-; RV32I-NEXT:    lbu t4, 14(a0)
-; RV32I-NEXT:    lbu t6, 15(a0)
-; RV32I-NEXT:    lbu a4, 16(a0)
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 17(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu t3, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t5, 21(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
 ; RV32I-NEXT:    lbu s0, 22(a0)
 ; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s2
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    lbu s2, 24(a0)
-; RV32I-NEXT:    lbu s6, 25(a0)
-; RV32I-NEXT:    lbu s7, 26(a0)
-; RV32I-NEXT:    lbu s8, 27(a0)
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s3, s9, s3
-; RV32I-NEXT:    or s4, s11, s10
-; RV32I-NEXT:    or s5, a1, ra
-; RV32I-NEXT:    lbu s9, 28(a0)
-; RV32I-NEXT:    lbu a1, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a3)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 8
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    mv s1, sp
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a3, a3, 2
-; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a6, a6, s11
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    or a4, t5, a4
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or t3, s6, s2
-; RV32I-NEXT:    or t5, s8, s7
-; RV32I-NEXT:    or a1, a1, s9
-; RV32I-NEXT:    or a0, a0, s10
-; RV32I-NEXT:    andi a3, a3, 28
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s3
-; RV32I-NEXT:    or t1, t4, s5
-; RV32I-NEXT:    or a6, t2, a6
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, t5, t3
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    add t6, t6, a3
-; RV32I-NEXT:    sw a6, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw t2, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    add s1, s1, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a7, 12(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    lw a6, 16(t6)
-; RV32I-NEXT:    lw a5, 20(t6)
-; RV32I-NEXT:    lw a7, 24(t6)
-; RV32I-NEXT:    lw a1, 0(t6)
-; RV32I-NEXT:    lw a0, 4(t6)
-; RV32I-NEXT:    lw a4, 8(t6)
-; RV32I-NEXT:    lw a3, 12(t6)
-; RV32I-NEXT:    lw t0, 28(t6)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -2822,21 +2814,21 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -2848,36 +2840,35 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -2903,111 +2894,111 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a7, 1(a0)
-; RV64I-NEXT:    lbu t2, 2(a0)
-; RV64I-NEXT:    lbu s3, 3(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu s8, 5(a0)
-; RV64I-NEXT:    lbu s9, 6(a0)
-; RV64I-NEXT:    lbu s10, 7(a0)
-; RV64I-NEXT:    lbu s2, 8(a0)
-; RV64I-NEXT:    lbu s4, 9(a0)
-; RV64I-NEXT:    lbu s5, 10(a0)
-; RV64I-NEXT:    lbu s6, 11(a0)
-; RV64I-NEXT:    lbu s7, 12(a0)
-; RV64I-NEXT:    lbu s11, 13(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu t3, 15(a0)
-; RV64I-NEXT:    lbu a3, 16(a0)
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu t4, 18(a0)
-; RV64I-NEXT:    lbu t5, 19(a0)
-; RV64I-NEXT:    lbu a4, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s0, 22(a0)
-; RV64I-NEXT:    lbu s1, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, s3, t2
-; RV64I-NEXT:    or t0, s8, t0
-; RV64I-NEXT:    or t2, s10, s9
-; RV64I-NEXT:    lbu s3, 24(a0)
-; RV64I-NEXT:    lbu s8, 25(a0)
-; RV64I-NEXT:    lbu s9, 26(a0)
-; RV64I-NEXT:    lbu s10, 27(a0)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s11, s11, 8
-; RV64I-NEXT:    or s2, s4, s2
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    or s5, s11, s7
-; RV64I-NEXT:    lbu s6, 28(a0)
-; RV64I-NEXT:    lbu s7, 29(a0)
-; RV64I-NEXT:    lbu s11, 30(a0)
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or t1, t3, t1
-; RV64I-NEXT:    mv t3, sp
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli t5, t5, 24
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    slli s1, s1, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or s6, s11, s10
+; RV64I-NEXT:    mv s7, sp
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a4, t6, a4
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or t4, s8, s3
-; RV64I-NEXT:    or t5, s10, s9
-; RV64I-NEXT:    or t6, s7, s6
-; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
+; RV64I-NEXT:    or a0, a0, s5
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t2, t0
-; RV64I-NEXT:    or t0, s4, s2
-; RV64I-NEXT:    or t1, t1, s5
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a4, s0, a4
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a0, a0, t6
-; RV64I-NEXT:    add t3, t3, a1
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    add s7, s7, a1
 ; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a1, a7, a5
-; RV64I-NEXT:    or a5, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    sd a1, 0(sp)
-; RV64I-NEXT:    sd a5, 8(sp)
-; RV64I-NEXT:    sd a3, 16(sp)
+; RV64I-NEXT:    or a1, a6, a5
+; RV64I-NEXT:    or a4, a7, s0
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a1, 8(sp)
+; RV64I-NEXT:    sd a4, 16(sp)
 ; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    ld a4, 16(t3)
-; RV64I-NEXT:    ld a0, 8(t3)
-; RV64I-NEXT:    ld a1, 0(t3)
-; RV64I-NEXT:    ld a3, 24(t3)
+; RV64I-NEXT:    ld a4, 16(s7)
+; RV64I-NEXT:    ld a0, 8(s7)
+; RV64I-NEXT:    ld a1, 0(s7)
+; RV64I-NEXT:    ld a3, 24(s7)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -3026,25 +3017,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
-; RV64I-NEXT:    srli s8, a1, 24
-; RV64I-NEXT:    srli s9, a1, 16
-; RV64I-NEXT:    srli s10, a1, 8
-; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
+; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a5, a0, 56
 ; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    srli t1, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli a6, a0, 32
+; RV64I-NEXT:    srli t2, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -3054,19 +3045,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    srli t3, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb s10, 1(a2)
-; RV64I-NEXT:    sb s9, 2(a2)
-; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
+; RV64I-NEXT:    sb a7, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a5, 14(a2)
-; RV64I-NEXT:    sb s11, 15(a2)
+; RV64I-NEXT:    sb t2, 12(a2)
+; RV64I-NEXT:    sb t1, 13(a2)
+; RV64I-NEXT:    sb a4, 14(a2)
+; RV64I-NEXT:    sb a5, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb t3, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -3085,132 +3076,128 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; RV32I-LABEL: lshr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a7, 1(a0)
-; RV32I-NEXT:    lbu t0, 2(a0)
-; RV32I-NEXT:    lbu t1, 3(a0)
-; RV32I-NEXT:    lbu s2, 4(a0)
-; RV32I-NEXT:    lbu s4, 5(a0)
-; RV32I-NEXT:    lbu s5, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s3, 8(a0)
-; RV32I-NEXT:    lbu s9, 9(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 11(a0)
-; RV32I-NEXT:    lbu ra, 12(a0)
-; RV32I-NEXT:    lbu a1, 13(a0)
-; RV32I-NEXT:    lbu t4, 14(a0)
-; RV32I-NEXT:    lbu t6, 15(a0)
-; RV32I-NEXT:    lbu a4, 16(a0)
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 17(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu t3, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t5, 21(a0)
-; RV32I-NEXT:    lbu s0, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s2
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    lbu s2, 24(a0)
-; RV32I-NEXT:    lbu s6, 25(a0)
-; RV32I-NEXT:    lbu s7, 26(a0)
-; RV32I-NEXT:    lbu s8, 27(a0)
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s3, s9, s3
-; RV32I-NEXT:    or s4, s11, s10
-; RV32I-NEXT:    or s5, a1, ra
-; RV32I-NEXT:    lbu s9, 28(a0)
-; RV32I-NEXT:    lbu a1, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu s0, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a3)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 8
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    mv s1, sp
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a3, a3, 3
-; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a6, a6, s11
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    or a4, t5, a4
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or t3, s6, s2
-; RV32I-NEXT:    or t5, s8, s7
-; RV32I-NEXT:    or a1, a1, s9
-; RV32I-NEXT:    or a0, a0, s10
-; RV32I-NEXT:    andi a3, a3, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s3
-; RV32I-NEXT:    or t1, t4, s5
-; RV32I-NEXT:    or a6, t2, a6
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, t5, t3
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    add t6, t6, a3
-; RV32I-NEXT:    sw a6, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw t2, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
+; RV32I-NEXT:    andi a1, a1, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    add s1, s1, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a7, 12(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    lw a6, 16(t6)
-; RV32I-NEXT:    lw a5, 20(t6)
-; RV32I-NEXT:    lw a7, 24(t6)
-; RV32I-NEXT:    lw a1, 0(t6)
-; RV32I-NEXT:    lw a0, 4(t6)
-; RV32I-NEXT:    lw a4, 8(t6)
-; RV32I-NEXT:    lw a3, 12(t6)
-; RV32I-NEXT:    lw t0, 28(t6)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -3225,21 +3212,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -3251,36 +3238,35 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -3524,132 +3510,129 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s1, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t3, 5(a0)
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu s0, 7(a0)
-; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s6, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu s9, 12(a0)
-; RV32I-NEXT:    lbu s10, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s7, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu ra, 18(a0)
-; RV32I-NEXT:    lbu a3, 19(a0)
-; RV32I-NEXT:    lbu t5, 20(a0)
-; RV32I-NEXT:    lbu t6, 21(a0)
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t3, t1
-; RV32I-NEXT:    or a6, s0, t4
-; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
 ; RV32I-NEXT:    lbu s0, 25(a0)
 ; RV32I-NEXT:    lbu s1, 26(a0)
 ; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s5, s5, 8
 ; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    or t2, s3, t2
-; RV32I-NEXT:    or t3, s8, s6
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    lbu s3, 28(a0)
-; RV32I-NEXT:    lbu s6, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s7, s4
-; RV32I-NEXT:    or s4, s11, s5
-; RV32I-NEXT:    or s5, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s10, 2(a1)
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    addi t6, sp, 40
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    addi s4, sp, 32
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, s0, t1
-; RV32I-NEXT:    or t1, s2, s1
-; RV32I-NEXT:    or s0, s6, s3
-; RV32I-NEXT:    or s1, s9, s8
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s10
-; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s2
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or a0, a0, t4
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or a7, a7, t5
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sw t2, 56(sp)
-; RV32I-NEXT:    sw a7, 60(sp)
-; RV32I-NEXT:    sw t0, 64(sp)
-; RV32I-NEXT:    sw s0, 68(sp)
-; RV32I-NEXT:    sw a4, 40(sp)
-; RV32I-NEXT:    sw a5, 44(sp)
-; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or a0, a0, t5
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s5, s3
+; RV32I-NEXT:    or a1, a1, s1
+; RV32I-NEXT:    sw a7, 48(sp)
 ; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
 ; RV32I-NEXT:    slli a3, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    sub a1, t6, a1
+; RV32I-NEXT:    sub a1, s4, a1
 ; RV32I-NEXT:    andi a0, a3, 24
 ; RV32I-NEXT:    xori a0, a0, 31
 ; RV32I-NEXT:    lw a4, 0(a1)
@@ -3664,10 +3647,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli t4, a4, 1
 ; RV32I-NEXT:    sll t5, a7, a3
 ; RV32I-NEXT:    srli t6, a6, 1
-; RV32I-NEXT:    sll s0, a6, a3
+; RV32I-NEXT:    sll a6, a6, a3
 ; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    sll s1, t1, a3
-; RV32I-NEXT:    srli a6, t0, 1
+; RV32I-NEXT:    sll s0, t1, a3
+; RV32I-NEXT:    srli s1, t0, 1
 ; RV32I-NEXT:    sll s2, t0, a3
 ; RV32I-NEXT:    srli a7, a7, 1
 ; RV32I-NEXT:    sll s3, a1, a3
@@ -3675,56 +3658,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll s4, t2, a3
 ; RV32I-NEXT:    srli t0, t1, 1
 ; RV32I-NEXT:    sll s5, a4, a3
-; RV32I-NEXT:    srl t2, t4, a0
-; RV32I-NEXT:    srl t4, t6, a0
-; RV32I-NEXT:    srl t6, a5, a0
-; RV32I-NEXT:    srl s6, a6, a0
-; RV32I-NEXT:    srl s7, a7, a0
-; RV32I-NEXT:    srl s8, a1, a0
-; RV32I-NEXT:    srl s9, t0, a0
-; RV32I-NEXT:    srli t1, s4, 24
-; RV32I-NEXT:    srli a7, s3, 24
+; RV32I-NEXT:    srl t4, t4, a0
+; RV32I-NEXT:    srl a4, t6, a0
+; RV32I-NEXT:    srl t1, a5, a0
+; RV32I-NEXT:    srl t6, s1, a0
+; RV32I-NEXT:    srl s1, a7, a0
+; RV32I-NEXT:    srl s6, a1, a0
+; RV32I-NEXT:    srl s7, t0, a0
+; RV32I-NEXT:    srli t2, s4, 24
+; RV32I-NEXT:    srli t0, s3, 24
 ; RV32I-NEXT:    srli a5, s2, 24
-; RV32I-NEXT:    srli a3, s1, 24
-; RV32I-NEXT:    srli a1, s0, 24
+; RV32I-NEXT:    srli a3, s0, 24
+; RV32I-NEXT:    srli a1, a6, 24
 ; RV32I-NEXT:    srli a0, t5, 24
-; RV32I-NEXT:    srli s10, s5, 24
-; RV32I-NEXT:    srli s11, s5, 16
-; RV32I-NEXT:    srli ra, s5, 8
-; RV32I-NEXT:    srli a4, t3, 24
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or t0, t5, t4
-; RV32I-NEXT:    or t2, s0, t6
-; RV32I-NEXT:    or t3, s1, s6
-; RV32I-NEXT:    or t4, s2, s7
-; RV32I-NEXT:    or t5, s3, s8
-; RV32I-NEXT:    or t6, s4, s9
+; RV32I-NEXT:    srli s8, s5, 24
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    srli t5, s5, 16
+; RV32I-NEXT:    or t1, a6, t1
+; RV32I-NEXT:    srli s9, s5, 8
+; RV32I-NEXT:    or a7, t3, t4
+; RV32I-NEXT:    srli a6, t3, 24
+; RV32I-NEXT:    or t3, s0, t6
+; RV32I-NEXT:    or t4, s2, s1
+; RV32I-NEXT:    or t6, s3, s6
+; RV32I-NEXT:    or s0, s4, s7
 ; RV32I-NEXT:    sb s5, 0(a2)
-; RV32I-NEXT:    sb ra, 1(a2)
-; RV32I-NEXT:    sb s11, 2(a2)
-; RV32I-NEXT:    sb s10, 3(a2)
-; RV32I-NEXT:    srli s0, t6, 16
-; RV32I-NEXT:    srli s1, t6, 8
-; RV32I-NEXT:    srli s2, t5, 16
-; RV32I-NEXT:    srli s3, t5, 8
+; RV32I-NEXT:    sb s9, 1(a2)
+; RV32I-NEXT:    sb t5, 2(a2)
+; RV32I-NEXT:    sb s8, 3(a2)
+; RV32I-NEXT:    srli t5, s0, 16
+; RV32I-NEXT:    srli s1, s0, 8
+; RV32I-NEXT:    srli s2, t6, 16
+; RV32I-NEXT:    srli s3, t6, 8
 ; RV32I-NEXT:    srli s4, t4, 16
 ; RV32I-NEXT:    srli s5, t4, 8
 ; RV32I-NEXT:    srli s6, t3, 16
 ; RV32I-NEXT:    srli s7, t3, 8
-; RV32I-NEXT:    srli s8, t2, 16
-; RV32I-NEXT:    srli s9, t2, 8
-; RV32I-NEXT:    srli s10, t0, 16
-; RV32I-NEXT:    srli s11, t0, 8
-; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    sb s0, 24(a2)
+; RV32I-NEXT:    srli s0, t1, 16
 ; RV32I-NEXT:    sb s1, 25(a2)
-; RV32I-NEXT:    sb s0, 26(a2)
-; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a6, 16
-; RV32I-NEXT:    sb t5, 28(a2)
+; RV32I-NEXT:    srli s1, t1, 8
+; RV32I-NEXT:    sb t5, 26(a2)
+; RV32I-NEXT:    srli t5, a4, 16
+; RV32I-NEXT:    sb t2, 27(a2)
+; RV32I-NEXT:    srli t2, a4, 8
+; RV32I-NEXT:    sb t6, 28(a2)
+; RV32I-NEXT:    srli t6, a7, 16
 ; RV32I-NEXT:    sb s3, 29(a2)
 ; RV32I-NEXT:    sb s2, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a7, a6, 8
+; RV32I-NEXT:    sb t0, 31(a2)
+; RV32I-NEXT:    srli t0, a7, 8
 ; RV32I-NEXT:    sb t4, 16(a2)
 ; RV32I-NEXT:    sb s5, 17(a2)
 ; RV32I-NEXT:    sb s4, 18(a2)
@@ -3733,32 +3716,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s7, 21(a2)
 ; RV32I-NEXT:    sb s6, 22(a2)
 ; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    sb t2, 8(a2)
-; RV32I-NEXT:    sb s9, 9(a2)
-; RV32I-NEXT:    sb s8, 10(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb s1, 9(a2)
+; RV32I-NEXT:    sb s0, 10(a2)
 ; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t5, 14(a2)
 ; RV32I-NEXT:    sb a0, 15(a2)
-; RV32I-NEXT:    sb a6, 4(a2)
-; RV32I-NEXT:    sb a7, 5(a2)
-; RV32I-NEXT:    sb t1, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a7, 4(a2)
+; RV32I-NEXT:    sb t0, 5(a2)
+; RV32I-NEXT:    sb t6, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -4003,132 +3985,128 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ;
 ; RV32I-LABEL: shl_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a7, 1(a0)
-; RV32I-NEXT:    lbu t0, 2(a0)
-; RV32I-NEXT:    lbu t1, 3(a0)
-; RV32I-NEXT:    lbu s2, 4(a0)
-; RV32I-NEXT:    lbu s4, 5(a0)
-; RV32I-NEXT:    lbu s5, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s3, 8(a0)
-; RV32I-NEXT:    lbu s9, 9(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 11(a0)
-; RV32I-NEXT:    lbu ra, 12(a0)
-; RV32I-NEXT:    lbu a1, 13(a0)
-; RV32I-NEXT:    lbu t4, 14(a0)
-; RV32I-NEXT:    lbu t6, 15(a0)
-; RV32I-NEXT:    lbu a4, 16(a0)
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 17(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu t3, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t5, 21(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
 ; RV32I-NEXT:    lbu s0, 22(a0)
 ; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s2
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    lbu s2, 24(a0)
-; RV32I-NEXT:    lbu s6, 25(a0)
-; RV32I-NEXT:    lbu s7, 26(a0)
-; RV32I-NEXT:    lbu s8, 27(a0)
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s3, s9, s3
-; RV32I-NEXT:    or s4, s11, s10
-; RV32I-NEXT:    or s5, a1, ra
-; RV32I-NEXT:    lbu s9, 28(a0)
-; RV32I-NEXT:    lbu a1, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a3)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 40
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    addi s1, sp, 32
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a3, a3, 2
-; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a6, a6, s11
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    or a4, t5, a4
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or t3, s6, s2
-; RV32I-NEXT:    or t5, s8, s7
-; RV32I-NEXT:    or a1, a1, s9
-; RV32I-NEXT:    or a0, a0, s10
-; RV32I-NEXT:    andi a3, a3, 28
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s3
-; RV32I-NEXT:    or t1, t4, s5
-; RV32I-NEXT:    or a6, t2, a6
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, t5, t3
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    sub t3, t6, a3
-; RV32I-NEXT:    sw a6, 56(sp)
-; RV32I-NEXT:    sw a4, 60(sp)
-; RV32I-NEXT:    sw t2, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    sub s1, s1, a1
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
 ; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a7, 44(sp)
-; RV32I-NEXT:    sw t0, 48(sp)
-; RV32I-NEXT:    sw t1, 52(sp)
-; RV32I-NEXT:    lw a6, 16(t3)
-; RV32I-NEXT:    lw a5, 20(t3)
-; RV32I-NEXT:    lw a7, 24(t3)
-; RV32I-NEXT:    lw a1, 0(t3)
-; RV32I-NEXT:    lw a0, 4(t3)
-; RV32I-NEXT:    lw a4, 8(t3)
-; RV32I-NEXT:    lw a3, 12(t3)
-; RV32I-NEXT:    lw t0, 28(t3)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -4143,21 +4121,21 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -4169,36 +4147,35 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -4224,111 +4201,111 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a7, 1(a0)
-; RV64I-NEXT:    lbu t2, 2(a0)
-; RV64I-NEXT:    lbu s3, 3(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu s8, 5(a0)
-; RV64I-NEXT:    lbu s9, 6(a0)
-; RV64I-NEXT:    lbu s10, 7(a0)
-; RV64I-NEXT:    lbu s2, 8(a0)
-; RV64I-NEXT:    lbu s4, 9(a0)
-; RV64I-NEXT:    lbu s5, 10(a0)
-; RV64I-NEXT:    lbu s6, 11(a0)
-; RV64I-NEXT:    lbu s7, 12(a0)
-; RV64I-NEXT:    lbu s11, 13(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu t3, 15(a0)
-; RV64I-NEXT:    lbu a3, 16(a0)
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu t4, 18(a0)
-; RV64I-NEXT:    lbu t5, 19(a0)
-; RV64I-NEXT:    lbu a4, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s0, 22(a0)
-; RV64I-NEXT:    lbu s1, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, s3, t2
-; RV64I-NEXT:    or t0, s8, t0
-; RV64I-NEXT:    or t2, s10, s9
-; RV64I-NEXT:    lbu s3, 24(a0)
-; RV64I-NEXT:    lbu s8, 25(a0)
-; RV64I-NEXT:    lbu s9, 26(a0)
-; RV64I-NEXT:    lbu s10, 27(a0)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s11, s11, 8
-; RV64I-NEXT:    or s2, s4, s2
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    or s5, s11, s7
-; RV64I-NEXT:    lbu s6, 28(a0)
-; RV64I-NEXT:    lbu s7, 29(a0)
-; RV64I-NEXT:    lbu s11, 30(a0)
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or t1, t3, t1
-; RV64I-NEXT:    addi t3, sp, 32
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli t5, t5, 24
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    slli s1, s1, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or s6, s11, s10
+; RV64I-NEXT:    addi s7, sp, 32
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a4, t6, a4
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or t4, s8, s3
-; RV64I-NEXT:    or t5, s10, s9
-; RV64I-NEXT:    or t6, s7, s6
-; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
+; RV64I-NEXT:    or a0, a0, s5
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t2, t0
-; RV64I-NEXT:    or t0, s4, s2
-; RV64I-NEXT:    or t1, t1, s5
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a4, s0, a4
-; RV64I-NEXT:    or a6, t5, t4
-; RV64I-NEXT:    or a0, a0, t6
-; RV64I-NEXT:    sub t2, t3, a1
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    sub t1, s7, a1
 ; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a1, a7, a5
-; RV64I-NEXT:    or a5, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    sd a1, 32(sp)
-; RV64I-NEXT:    sd a5, 40(sp)
-; RV64I-NEXT:    sd a3, 48(sp)
+; RV64I-NEXT:    or a1, a6, a5
+; RV64I-NEXT:    or a4, a7, s0
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    sd a1, 40(sp)
+; RV64I-NEXT:    sd a4, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    ld a4, 16(t2)
-; RV64I-NEXT:    ld a0, 8(t2)
-; RV64I-NEXT:    ld a1, 0(t2)
-; RV64I-NEXT:    ld a3, 24(t2)
+; RV64I-NEXT:    ld a4, 16(t1)
+; RV64I-NEXT:    ld a0, 8(t1)
+; RV64I-NEXT:    ld a1, 0(t1)
+; RV64I-NEXT:    ld a3, 24(t1)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -4347,25 +4324,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
-; RV64I-NEXT:    srli s8, a1, 24
-; RV64I-NEXT:    srli s9, a1, 16
-; RV64I-NEXT:    srli s10, a1, 8
-; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
+; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a5, a0, 56
 ; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    srli t1, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli a6, a0, 32
+; RV64I-NEXT:    srli t2, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -4375,19 +4352,19 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    srli t3, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb s10, 1(a2)
-; RV64I-NEXT:    sb s9, 2(a2)
-; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
+; RV64I-NEXT:    sb a7, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a5, 14(a2)
-; RV64I-NEXT:    sb s11, 15(a2)
+; RV64I-NEXT:    sb t2, 12(a2)
+; RV64I-NEXT:    sb t1, 13(a2)
+; RV64I-NEXT:    sb a4, 14(a2)
+; RV64I-NEXT:    sb a5, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb t3, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -4406,132 +4383,128 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ;
 ; RV32I-LABEL: shl_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a7, 1(a0)
-; RV32I-NEXT:    lbu t0, 2(a0)
-; RV32I-NEXT:    lbu t1, 3(a0)
-; RV32I-NEXT:    lbu s2, 4(a0)
-; RV32I-NEXT:    lbu s4, 5(a0)
-; RV32I-NEXT:    lbu s5, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s3, 8(a0)
-; RV32I-NEXT:    lbu s9, 9(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 11(a0)
-; RV32I-NEXT:    lbu ra, 12(a0)
-; RV32I-NEXT:    lbu a1, 13(a0)
-; RV32I-NEXT:    lbu t4, 14(a0)
-; RV32I-NEXT:    lbu t6, 15(a0)
-; RV32I-NEXT:    lbu a4, 16(a0)
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 17(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu t3, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t5, 21(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
 ; RV32I-NEXT:    lbu s0, 22(a0)
 ; RV32I-NEXT:    lbu s1, 23(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s2
-; RV32I-NEXT:    or t1, s6, s5
-; RV32I-NEXT:    lbu s2, 24(a0)
-; RV32I-NEXT:    lbu s6, 25(a0)
-; RV32I-NEXT:    lbu s7, 26(a0)
-; RV32I-NEXT:    lbu s8, 27(a0)
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s3, s9, s3
-; RV32I-NEXT:    or s4, s11, s10
-; RV32I-NEXT:    or s5, a1, ra
-; RV32I-NEXT:    lbu s9, 28(a0)
-; RV32I-NEXT:    lbu a1, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s3, s2
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s2, 25(a0)
+; RV32I-NEXT:    lbu s3, 26(a0)
+; RV32I-NEXT:    lbu s4, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or t6, s11, s10
+; RV32I-NEXT:    lbu s5, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s7, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a3)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or t4, t6, t4
-; RV32I-NEXT:    addi t6, sp, 40
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    slli t5, t5, 8
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    addi s1, sp, 32
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s4, s4, 24
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli a3, a3, 3
-; RV32I-NEXT:    lw s11, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a6, a6, s11
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    or a4, t5, a4
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or t3, s6, s2
-; RV32I-NEXT:    or t5, s8, s7
-; RV32I-NEXT:    or a1, a1, s9
-; RV32I-NEXT:    or a0, a0, s10
-; RV32I-NEXT:    andi a3, a3, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or t0, s4, s3
-; RV32I-NEXT:    or t1, t4, s5
-; RV32I-NEXT:    or a6, t2, a6
-; RV32I-NEXT:    or a4, s0, a4
-; RV32I-NEXT:    or t2, t5, t3
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    sub t3, t6, a3
-; RV32I-NEXT:    sw a6, 56(sp)
-; RV32I-NEXT:    sw a4, 60(sp)
-; RV32I-NEXT:    sw t2, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    or t3, s2, t3
+; RV32I-NEXT:    or s2, s4, s3
+; RV32I-NEXT:    or s3, s6, s5
+; RV32I-NEXT:    or a0, a0, s7
+; RV32I-NEXT:    andi a1, a1, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t6
+; RV32I-NEXT:    or t1, s2, t3
+; RV32I-NEXT:    or a0, a0, s3
+; RV32I-NEXT:    sub s1, s1, a1
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
 ; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a7, 44(sp)
-; RV32I-NEXT:    sw t0, 48(sp)
-; RV32I-NEXT:    sw t1, 52(sp)
-; RV32I-NEXT:    lw a6, 16(t3)
-; RV32I-NEXT:    lw a5, 20(t3)
-; RV32I-NEXT:    lw a7, 24(t3)
-; RV32I-NEXT:    lw a1, 0(t3)
-; RV32I-NEXT:    lw a0, 4(t3)
-; RV32I-NEXT:    lw a4, 8(t3)
-; RV32I-NEXT:    lw a3, 12(t3)
-; RV32I-NEXT:    lw t0, 28(t3)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    lw a6, 16(s1)
+; RV32I-NEXT:    lw a5, 20(s1)
+; RV32I-NEXT:    lw a7, 24(s1)
+; RV32I-NEXT:    lw a1, 0(s1)
+; RV32I-NEXT:    lw a0, 4(s1)
+; RV32I-NEXT:    lw a4, 8(s1)
+; RV32I-NEXT:    lw a3, 12(s1)
+; RV32I-NEXT:    lw t0, 28(s1)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -4546,21 +4519,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -4572,36 +4545,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -4846,140 +4818,137 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t6, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t3, 5(a0)
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu t5, 7(a0)
-; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu s1, 9(a0)
-; RV32I-NEXT:    lbu s7, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu s9, 12(a0)
-; RV32I-NEXT:    lbu s10, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s6, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu ra, 18(a0)
-; RV32I-NEXT:    lbu a3, 19(a0)
-; RV32I-NEXT:    lbu s2, 20(a0)
-; RV32I-NEXT:    lbu s3, 21(a0)
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    or a4, a4, t6
-; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t3, t1
-; RV32I-NEXT:    or a6, t5, t4
-; RV32I-NEXT:    lbu t1, 24(a0)
-; RV32I-NEXT:    lbu t5, 25(a0)
-; RV32I-NEXT:    lbu t6, 26(a0)
-; RV32I-NEXT:    lbu s0, 27(a0)
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
 ; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    or t2, s1, t2
-; RV32I-NEXT:    or t3, s8, s7
-; RV32I-NEXT:    or t4, s10, s9
-; RV32I-NEXT:    lbu s1, 28(a0)
-; RV32I-NEXT:    lbu s7, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s6, s4
-; RV32I-NEXT:    or s4, s11, s5
-; RV32I-NEXT:    or s5, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s6, 1(a1)
-; RV32I-NEXT:    lbu s10, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    addi s3, sp, 8
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, t5, t1
+; RV32I-NEXT:    or s5, a0, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, s6, t5
 ; RV32I-NEXT:    or t1, s0, t6
-; RV32I-NEXT:    or t5, s7, s1
-; RV32I-NEXT:    or t6, s9, s8
-; RV32I-NEXT:    or a3, s6, a3
-; RV32I-NEXT:    or a1, a1, s10
-; RV32I-NEXT:    srai s0, s9, 31
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t3, t2
-; RV32I-NEXT:    or a0, a0, t4
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or a7, a7, s2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    or t1, t6, t5
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sw s0, 56(sp)
-; RV32I-NEXT:    sw s0, 60(sp)
-; RV32I-NEXT:    sw s0, 64(sp)
-; RV32I-NEXT:    sw s0, 68(sp)
-; RV32I-NEXT:    sw s0, 40(sp)
-; RV32I-NEXT:    sw s0, 44(sp)
-; RV32I-NEXT:    sw s0, 48(sp)
-; RV32I-NEXT:    sw s0, 52(sp)
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw a7, 28(sp)
-; RV32I-NEXT:    sw t0, 32(sp)
-; RV32I-NEXT:    sw t1, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a1, a1, s1
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    add a1, s3, a1
+; RV32I-NEXT:    add a1, s4, a1
 ; RV32I-NEXT:    andi a0, t1, 24
-; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    xori a7, a0, 31
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw a7, 16(a1)
+; RV32I-NEXT:    lw t0, 16(a1)
 ; RV32I-NEXT:    lw t2, 20(a1)
 ; RV32I-NEXT:    lw t3, 24(a1)
 ; RV32I-NEXT:    lw t4, 28(a1)
@@ -4988,33 +4957,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl a1, a3, t1
 ; RV32I-NEXT:    slli t6, a4, 1
 ; RV32I-NEXT:    srl a3, a6, t1
-; RV32I-NEXT:    slli s0, a7, 1
+; RV32I-NEXT:    slli s0, t0, 1
 ; RV32I-NEXT:    srl a4, a5, t1
 ; RV32I-NEXT:    slli s1, a6, 1
 ; RV32I-NEXT:    srl a5, t2, t1
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    srl a6, a7, t1
+; RV32I-NEXT:    srl a6, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    srl a7, t3, t1
+; RV32I-NEXT:    srl t0, t3, t1
 ; RV32I-NEXT:    slli t3, t4, 1
 ; RV32I-NEXT:    sra t1, t4, t1
-; RV32I-NEXT:    sll t4, t5, t0
-; RV32I-NEXT:    sll t5, t6, t0
-; RV32I-NEXT:    sll t6, s0, t0
-; RV32I-NEXT:    sll s0, s1, t0
-; RV32I-NEXT:    sll s1, s2, t0
-; RV32I-NEXT:    sll t2, t2, t0
-; RV32I-NEXT:    sll t3, t3, t0
+; RV32I-NEXT:    sll t4, t5, a7
+; RV32I-NEXT:    sll t5, t6, a7
+; RV32I-NEXT:    sll t6, s0, a7
+; RV32I-NEXT:    sll s0, s1, a7
+; RV32I-NEXT:    sll s1, s2, a7
+; RV32I-NEXT:    sll t2, t2, a7
+; RV32I-NEXT:    sll t3, t3, a7
 ; RV32I-NEXT:    srli s2, t1, 24
 ; RV32I-NEXT:    srli s3, t1, 16
 ; RV32I-NEXT:    srli s4, t1, 8
-; RV32I-NEXT:    or t0, a0, t4
+; RV32I-NEXT:    or a7, a0, t4
 ; RV32I-NEXT:    or t4, a1, t5
 ; RV32I-NEXT:    or t5, a3, t6
 ; RV32I-NEXT:    or s0, a4, s0
 ; RV32I-NEXT:    or s1, a5, s1
 ; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    or t3, a7, t3
+; RV32I-NEXT:    or t3, t0, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
 ; RV32I-NEXT:    sb s3, 30(a2)
@@ -5031,23 +5000,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s6, s0, 24
 ; RV32I-NEXT:    srli s7, s0, 16
 ; RV32I-NEXT:    srli s0, s0, 8
-; RV32I-NEXT:    srli s8, t5, 24
-; RV32I-NEXT:    srli s9, t5, 16
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    srli s10, t4, 24
-; RV32I-NEXT:    srli s11, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    srli t0, t5, 24
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    srli t5, t5, 8
 ; RV32I-NEXT:    sb t6, 26(a2)
+; RV32I-NEXT:    srli t6, t4, 24
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, t0, 24
+; RV32I-NEXT:    srli t1, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    srli a6, a7, 24
 ; RV32I-NEXT:    sb t2, 17(a2)
 ; RV32I-NEXT:    sb s3, 18(a2)
 ; RV32I-NEXT:    sb s2, 19(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    srli t2, a7, 16
+; RV32I-NEXT:    srli a7, a7, 8
 ; RV32I-NEXT:    sb a5, 20(a2)
 ; RV32I-NEXT:    sb s1, 21(a2)
 ; RV32I-NEXT:    sb s5, 22(a2)
@@ -5058,30 +5027,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    sb a3, 12(a2)
 ; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb s9, 14(a2)
-; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb t3, 14(a2)
+; RV32I-NEXT:    sb t0, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t4, 1(a2)
-; RV32I-NEXT:    sb s11, 2(a2)
-; RV32I-NEXT:    sb s10, 3(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb t0, 5(a2)
-; RV32I-NEXT:    sb a6, 6(a2)
-; RV32I-NEXT:    sb a7, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb t2, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -5327,130 +5295,129 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ;
 ; RV32I-LABEL: ashr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t1, 3(a0)
-; RV32I-NEXT:    lbu s0, 4(a0)
-; RV32I-NEXT:    lbu s2, 5(a0)
-; RV32I-NEXT:    lbu s3, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu s7, 9(a0)
-; RV32I-NEXT:    lbu s8, 10(a0)
-; RV32I-NEXT:    lbu s9, 11(a0)
-; RV32I-NEXT:    lbu s10, 12(a0)
-; RV32I-NEXT:    lbu s11, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t0, 17(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu t3, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t4, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu t6, 23(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t1, a7
-; RV32I-NEXT:    or a7, s2, s0
-; RV32I-NEXT:    or t1, s6, s3
-; RV32I-NEXT:    lbu s0, 24(a0)
-; RV32I-NEXT:    lbu s6, 25(a0)
-; RV32I-NEXT:    lbu ra, 26(a0)
-; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or s1, s7, s1
-; RV32I-NEXT:    or s7, s9, s8
-; RV32I-NEXT:    or s3, s11, s10
-; RV32I-NEXT:    lbu s8, 28(a0)
-; RV32I-NEXT:    lbu s9, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or s4, s5, s4
-; RV32I-NEXT:    addi s5, sp, 8
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu t5, 25(a0)
+; RV32I-NEXT:    lbu t6, 26(a0)
+; RV32I-NEXT:    lbu s0, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t4, s5, s4
+; RV32I-NEXT:    or s1, s7, s6
+; RV32I-NEXT:    or s2, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    mv s7, sp
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    or t0, t3, t2
-; RV32I-NEXT:    or a4, t4, a4
-; RV32I-NEXT:    or t2, t6, t5
-; RV32I-NEXT:    or t3, s6, s0
-; RV32I-NEXT:    or t4, s2, ra
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or t6, a0, s10
+; RV32I-NEXT:    or t3, t5, t3
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or t6, s4, s3
+; RV32I-NEXT:    or s0, a0, s5
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t1, a7
-; RV32I-NEXT:    or a7, s7, s1
-; RV32I-NEXT:    or t1, s4, s3
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or t0, t4, t3
-; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, s1, t4
+; RV32I-NEXT:    or t0, s6, s2
+; RV32I-NEXT:    or t1, t5, t3
+; RV32I-NEXT:    or t2, s0, t6
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    add s5, s5, a1
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw t0, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
+; RV32I-NEXT:    add s7, s7, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    lw a6, 16(s5)
-; RV32I-NEXT:    lw a5, 20(s5)
-; RV32I-NEXT:    lw a7, 24(s5)
-; RV32I-NEXT:    lw a1, 0(s5)
-; RV32I-NEXT:    lw a0, 4(s5)
-; RV32I-NEXT:    lw a4, 8(s5)
-; RV32I-NEXT:    lw a3, 12(s5)
-; RV32I-NEXT:    lw t0, 28(s5)
+; RV32I-NEXT:    lw a6, 16(s7)
+; RV32I-NEXT:    lw a5, 20(s7)
+; RV32I-NEXT:    lw a7, 24(s7)
+; RV32I-NEXT:    lw a1, 0(s7)
+; RV32I-NEXT:    lw a0, 4(s7)
+; RV32I-NEXT:    lw a4, 8(s7)
+; RV32I-NEXT:    lw a3, 12(s7)
+; RV32I-NEXT:    lw t0, 28(s7)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -5465,21 +5432,21 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -5491,36 +5458,35 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -5546,112 +5512,112 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a7, 1(a0)
-; RV64I-NEXT:    lbu t1, 2(a0)
-; RV64I-NEXT:    lbu s3, 3(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu s8, 5(a0)
-; RV64I-NEXT:    lbu s9, 6(a0)
-; RV64I-NEXT:    lbu s10, 7(a0)
-; RV64I-NEXT:    lbu s2, 8(a0)
-; RV64I-NEXT:    lbu s4, 9(a0)
-; RV64I-NEXT:    lbu s5, 10(a0)
-; RV64I-NEXT:    lbu s6, 11(a0)
-; RV64I-NEXT:    lbu s7, 12(a0)
-; RV64I-NEXT:    lbu s11, 13(a0)
-; RV64I-NEXT:    lbu t4, 14(a0)
-; RV64I-NEXT:    lbu t5, 15(a0)
-; RV64I-NEXT:    lbu a3, 16(a0)
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu t2, 18(a0)
-; RV64I-NEXT:    lbu t3, 19(a0)
-; RV64I-NEXT:    lbu a4, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s0, 22(a0)
-; RV64I-NEXT:    lbu s1, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, s3, t1
-; RV64I-NEXT:    or t0, s8, t0
-; RV64I-NEXT:    or t1, s10, s9
-; RV64I-NEXT:    lbu s3, 24(a0)
-; RV64I-NEXT:    lbu s8, 25(a0)
-; RV64I-NEXT:    lbu s9, 26(a0)
-; RV64I-NEXT:    lbu s10, 27(a0)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s11, s11, 8
-; RV64I-NEXT:    or s2, s4, s2
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    or s5, s11, s7
-; RV64I-NEXT:    lbu s6, 28(a0)
-; RV64I-NEXT:    lbu s7, 29(a0)
-; RV64I-NEXT:    lbu s11, 30(a0)
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli t5, t5, 24
-; RV64I-NEXT:    or t4, t5, t4
-; RV64I-NEXT:    mv t5, sp
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    slli s1, s1, 24
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    slli s9, s9, 16
-; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or s6, s11, s10
+; RV64I-NEXT:    mv s7, sp
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a6, t3, t2
-; RV64I-NEXT:    or a4, t6, a4
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    or t2, s8, s3
-; RV64I-NEXT:    or t3, s10, s9
-; RV64I-NEXT:    or t6, s7, s6
-; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
+; RV64I-NEXT:    or a0, a0, s5
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    or t0, s4, s2
-; RV64I-NEXT:    or t1, t4, s5
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a4, s0, a4
-; RV64I-NEXT:    or a6, t3, t2
-; RV64I-NEXT:    or a0, a0, t6
-; RV64I-NEXT:    add t5, t5, a1
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    add s7, s7, a1
 ; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    sraiw a0, a0, 31
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, a7, s0
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    sd a0, 32(sp)
 ; RV64I-NEXT:    sd a0, 40(sp)
 ; RV64I-NEXT:    sd a0, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a5, 0(sp)
-; RV64I-NEXT:    sd a7, 8(sp)
-; RV64I-NEXT:    sd a3, 16(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
 ; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a4, 16(t5)
-; RV64I-NEXT:    ld a0, 8(t5)
-; RV64I-NEXT:    ld a1, 0(t5)
-; RV64I-NEXT:    ld a3, 24(t5)
+; RV64I-NEXT:    ld a4, 16(s7)
+; RV64I-NEXT:    ld a0, 8(s7)
+; RV64I-NEXT:    ld a1, 0(s7)
+; RV64I-NEXT:    ld a3, 24(s7)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
@@ -5670,25 +5636,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    srli s5, a1, 48
 ; RV64I-NEXT:    srli s6, a1, 40
 ; RV64I-NEXT:    srli s7, a1, 32
-; RV64I-NEXT:    srli s8, a1, 24
-; RV64I-NEXT:    srli s9, a1, 16
-; RV64I-NEXT:    srli s10, a1, 8
-; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a1, 24
 ; RV64I-NEXT:    sb a7, 21(a2)
+; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a1, 8
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a5, a0, 56
 ; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb t3, 17(a2)
 ; RV64I-NEXT:    sb t2, 18(a2)
 ; RV64I-NEXT:    sb t1, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    srli t1, a0, 40
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
-; RV64I-NEXT:    srli a6, a0, 32
+; RV64I-NEXT:    srli t2, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s3, 25(a2)
 ; RV64I-NEXT:    sb s2, 26(a2)
@@ -5698,19 +5664,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sb s6, 5(a2)
 ; RV64I-NEXT:    sb s5, 6(a2)
 ; RV64I-NEXT:    sb s4, 7(a2)
-; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    srli t3, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb s10, 1(a2)
-; RV64I-NEXT:    sb s9, 2(a2)
-; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
+; RV64I-NEXT:    sb a7, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a5, 14(a2)
-; RV64I-NEXT:    sb s11, 15(a2)
+; RV64I-NEXT:    sb t2, 12(a2)
+; RV64I-NEXT:    sb t1, 13(a2)
+; RV64I-NEXT:    sb a4, 14(a2)
+; RV64I-NEXT:    sb a5, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb t3, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -5729,130 +5695,129 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; RV32I-LABEL: ashr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t1, 3(a0)
-; RV32I-NEXT:    lbu s0, 4(a0)
-; RV32I-NEXT:    lbu s2, 5(a0)
-; RV32I-NEXT:    lbu s3, 6(a0)
-; RV32I-NEXT:    lbu s6, 7(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu s7, 9(a0)
-; RV32I-NEXT:    lbu s8, 10(a0)
-; RV32I-NEXT:    lbu s9, 11(a0)
-; RV32I-NEXT:    lbu s10, 12(a0)
-; RV32I-NEXT:    lbu s11, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu a3, 16(a0)
-; RV32I-NEXT:    lbu t0, 17(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu t3, 19(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu t4, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu t6, 23(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t1, a7
-; RV32I-NEXT:    or a7, s2, s0
-; RV32I-NEXT:    or t1, s6, s3
-; RV32I-NEXT:    lbu s0, 24(a0)
-; RV32I-NEXT:    lbu s6, 25(a0)
-; RV32I-NEXT:    lbu ra, 26(a0)
-; RV32I-NEXT:    lbu s2, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or s1, s7, s1
-; RV32I-NEXT:    or s7, s9, s8
-; RV32I-NEXT:    or s3, s11, s10
-; RV32I-NEXT:    lbu s8, 28(a0)
-; RV32I-NEXT:    lbu s9, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or s4, s5, s4
-; RV32I-NEXT:    addi s5, sp, 8
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu t5, 25(a0)
+; RV32I-NEXT:    lbu t6, 26(a0)
+; RV32I-NEXT:    lbu s0, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t4, s5, s4
+; RV32I-NEXT:    or s1, s7, s6
+; RV32I-NEXT:    or s2, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    mv s7, sp
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    or t0, t3, t2
-; RV32I-NEXT:    or a4, t4, a4
-; RV32I-NEXT:    or t2, t6, t5
-; RV32I-NEXT:    or t3, s6, s0
-; RV32I-NEXT:    or t4, s2, ra
-; RV32I-NEXT:    or t5, s9, s8
-; RV32I-NEXT:    or t6, a0, s10
+; RV32I-NEXT:    or t3, t5, t3
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or t6, s4, s3
+; RV32I-NEXT:    or s0, a0, s5
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t1, a7
-; RV32I-NEXT:    or a7, s7, s1
-; RV32I-NEXT:    or t1, s4, s3
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or t0, t4, t3
-; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, s1, t4
+; RV32I-NEXT:    or t0, s6, s2
+; RV32I-NEXT:    or t1, t5, t3
+; RV32I-NEXT:    or t2, s0, t6
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 64(sp)
-; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    add s5, s5, a1
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw t0, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
+; RV32I-NEXT:    add s7, s7, a1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    lw a6, 16(s5)
-; RV32I-NEXT:    lw a5, 20(s5)
-; RV32I-NEXT:    lw a7, 24(s5)
-; RV32I-NEXT:    lw a1, 0(s5)
-; RV32I-NEXT:    lw a0, 4(s5)
-; RV32I-NEXT:    lw a4, 8(s5)
-; RV32I-NEXT:    lw a3, 12(s5)
-; RV32I-NEXT:    lw t0, 28(s5)
+; RV32I-NEXT:    lw a6, 16(s7)
+; RV32I-NEXT:    lw a5, 20(s7)
+; RV32I-NEXT:    lw a7, 24(s7)
+; RV32I-NEXT:    lw a1, 0(s7)
+; RV32I-NEXT:    lw a0, 4(s7)
+; RV32I-NEXT:    lw a4, 8(s7)
+; RV32I-NEXT:    lw a3, 12(s7)
+; RV32I-NEXT:    lw t0, 28(s7)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
@@ -5867,21 +5832,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    srli s5, a5, 8
 ; RV32I-NEXT:    srli s6, a4, 24
 ; RV32I-NEXT:    srli s7, a4, 16
-; RV32I-NEXT:    srli s8, a4, 8
-; RV32I-NEXT:    srli s9, a3, 24
-; RV32I-NEXT:    srli s10, a3, 16
-; RV32I-NEXT:    srli s11, a3, 8
 ; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    srli a7, a1, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    srli t1, a3, 8
 ; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli t0, a1, 24
 ; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    srli t6, a1, 16
 ; RV32I-NEXT:    sb t5, 30(a2)
 ; RV32I-NEXT:    sb t4, 31(a2)
-; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    srli t4, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb s2, 17(a2)
 ; RV32I-NEXT:    sb s1, 18(a2)
@@ -5893,36 +5858,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sb s3, 23(a2)
 ; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb s7, 10(a2)
 ; RV32I-NEXT:    sb s6, 11(a2)
 ; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb s11, 13(a2)
-; RV32I-NEXT:    sb s10, 14(a2)
-; RV32I-NEXT:    sb s9, 15(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb t0, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t6, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index b2c130c2d7c10a..b8952d2cb2b29e 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -1530,25 +1530,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu a7, 3(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -1557,107 +1556,105 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t5, 10(a0)
 ; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    lbu a3, 23(a0)
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
 ; RV32I-NEXT:    or a7, t4, t3
 ; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or t3, s7, s6
-; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
 ; RV32I-NEXT:    lbu s4, 29(a0)
 ; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or s2, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 64(sp)
-; RV32I-NEXT:    sw zero, 68(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s1, s3, s1
-; RV32I-NEXT:    addi s3, sp, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
 ; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s4, t6
-; RV32I-NEXT:    or t6, s6, s5
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s8
-; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, a0, t3
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, t4, s1
-; RV32I-NEXT:    or t3, t6, t5
-; RV32I-NEXT:    or a0, a1, a3
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw t2, 32(sp)
-; RV32I-NEXT:    sw t3, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, a0, t5
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a0, a1, s1
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s3, a4
+; RV32I-NEXT:    add a4, s4, a4
 ; RV32I-NEXT:    lw a3, 0(a4)
 ; RV32I-NEXT:    lw a5, 4(a4)
 ; RV32I-NEXT:    lw a6, 8(a4)
@@ -1717,13 +1714,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
-; RV32I-NEXT:    srli s8, a1, 24
-; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb t5, 17(a2)
 ; RV32I-NEXT:    sb t4, 18(a2)
@@ -1744,27 +1741,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 14(a2)
 ; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    sb s9, 2(a2)
-; RV32I-NEXT:    sb s8, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t2, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2006,25 +2002,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu a7, 3(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -2033,107 +2028,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t5, 10(a0)
 ; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    lbu a3, 23(a0)
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
 ; RV32I-NEXT:    or a7, t4, t3
 ; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or t3, s7, s6
-; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
 ; RV32I-NEXT:    lbu s4, 29(a0)
 ; RV32I-NEXT:    lbu s5, 30(a0)
 ; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or s2, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or a0, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s1, s3, s1
-; RV32I-NEXT:    addi s3, sp, 40
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    addi s4, sp, 32
 ; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s4, t6
-; RV32I-NEXT:    or t6, s6, s5
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s8
-; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, a0, t3
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, t4, s1
-; RV32I-NEXT:    or t3, t6, t5
-; RV32I-NEXT:    or a0, a1, a3
-; RV32I-NEXT:    sw t0, 56(sp)
-; RV32I-NEXT:    sw t1, 60(sp)
-; RV32I-NEXT:    sw t2, 64(sp)
-; RV32I-NEXT:    sw t3, 68(sp)
-; RV32I-NEXT:    sw a4, 40(sp)
-; RV32I-NEXT:    sw a5, 44(sp)
-; RV32I-NEXT:    sw a6, 48(sp)
-; RV32I-NEXT:    sw a7, 52(sp)
+; RV32I-NEXT:    or s5, s6, s5
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, a0, t5
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a0, a1, s1
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw t2, 60(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    sub a3, s3, a4
+; RV32I-NEXT:    sub a3, s4, a4
 ; RV32I-NEXT:    lw a4, 0(a3)
 ; RV32I-NEXT:    lw a5, 4(a3)
 ; RV32I-NEXT:    lw a6, 8(a3)
@@ -2193,13 +2186,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
-; RV32I-NEXT:    srli s8, a1, 24
-; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 28(a2)
 ; RV32I-NEXT:    sb t5, 29(a2)
 ; RV32I-NEXT:    sb t4, 30(a2)
@@ -2220,27 +2213,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 10(a2)
 ; RV32I-NEXT:    sb s5, 11(a2)
 ; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    sb s9, 14(a2)
-; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb t0, 13(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb a7, 15(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2483,25 +2475,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -112
+; RV32I-NEXT:    sw s0, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 64(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu a7, 3(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
@@ -2518,100 +2509,98 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a4, a7, a6
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
 ; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu a3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
 ; RV32I-NEXT:    or t1, s1, s0
 ; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    lbu t6, 28(a0)
-; RV32I-NEXT:    lbu s0, 29(a0)
-; RV32I-NEXT:    lbu s1, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s5, s5, 8
 ; RV32I-NEXT:    slli s6, s6, 16
 ; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or s2, s7, s6
-; RV32I-NEXT:    or s3, s9, s8
-; RV32I-NEXT:    or s4, s11, s10
-; RV32I-NEXT:    lbu s5, 0(a1)
-; RV32I-NEXT:    lbu s6, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, ra
-; RV32I-NEXT:    addi s8, sp, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli s0, s0, 8
 ; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or s6, s11, s10
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or s3, s4, s3
+; RV32I-NEXT:    mv s4, sp
+; RV32I-NEXT:    slli s5, s5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s2, s2, 8
 ; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or s1, a0, s1
-; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or s5, a0, s5
+; RV32I-NEXT:    or s1, s2, s1
 ; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, a0
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, s2, t3
-; RV32I-NEXT:    or t1, s4, s3
-; RV32I-NEXT:    or a3, t4, a3
-; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    or a0, a1, t6
-; RV32I-NEXT:    sw s0, 56(sp)
-; RV32I-NEXT:    sw s0, 60(sp)
-; RV32I-NEXT:    sw s0, 64(sp)
-; RV32I-NEXT:    sw s0, 68(sp)
-; RV32I-NEXT:    sw s0, 40(sp)
-; RV32I-NEXT:    sw s0, 44(sp)
-; RV32I-NEXT:    sw s0, 48(sp)
-; RV32I-NEXT:    sw s0, 52(sp)
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    srai s2, a0, 31
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, s6, t5
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t2, s5, s3
+; RV32I-NEXT:    or a0, a1, s1
+; RV32I-NEXT:    sw s2, 48(sp)
+; RV32I-NEXT:    sw s2, 52(sp)
+; RV32I-NEXT:    sw s2, 56(sp)
+; RV32I-NEXT:    sw s2, 60(sp)
+; RV32I-NEXT:    sw s2, 32(sp)
+; RV32I-NEXT:    sw s2, 36(sp)
+; RV32I-NEXT:    sw s2, 40(sp)
+; RV32I-NEXT:    sw s2, 44(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    andi a3, a0, 31
 ; RV32I-NEXT:    andi a4, a1, 28
 ; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s8, a4
+; RV32I-NEXT:    add a4, s4, a4
 ; RV32I-NEXT:    lw a3, 0(a4)
 ; RV32I-NEXT:    lw a5, 4(a4)
 ; RV32I-NEXT:    lw a6, 8(a4)
@@ -2671,13 +2660,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli s5, a3, 24
 ; RV32I-NEXT:    srli s6, a3, 16
 ; RV32I-NEXT:    srli s7, a3, 8
-; RV32I-NEXT:    srli s8, a1, 24
-; RV32I-NEXT:    srli s9, a1, 16
 ; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a7, a1, 24
 ; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a1, 16
 ; RV32I-NEXT:    sb t1, 26(a2)
 ; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    srli t0, a1, 8
 ; RV32I-NEXT:    sb a6, 16(a2)
 ; RV32I-NEXT:    sb t5, 17(a2)
 ; RV32I-NEXT:    sb t4, 18(a2)
@@ -2698,27 +2687,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s6, 14(a2)
 ; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    sb s9, 2(a2)
-; RV32I-NEXT:    sb s8, 3(a2)
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb t2, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
 ; RV32I-NEXT:    sb a5, 6(a2)
 ; RV32I-NEXT:    sb a6, 7(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw s0, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 2c4b1f36ffd23d..67759bd5c4632e 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -50,8 +50,8 @@ public:
   const char *getRegPressureSetName(unsigned Idx) const override {
     return "bogus";
   }
-  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx,
-                                  bool RemoveReserved) const override {
+  unsigned getRegPressureSetLimit(const MachineFunction &MF,
+                                  unsigned Idx) const override {
     return 0;
   }
   const int *
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index 674925c1b2acd3..a6f87119aca5ba 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -275,8 +275,7 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
   OS << "// Get the register unit pressure limit for this dimension.\n"
      << "// This limit must be adjusted dynamically for reserved registers.\n"
      << "unsigned " << ClassName << "::\n"
-     << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx, bool "
-        "RemoveReserved) const "
+     << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const "
         "{\n"
      << "  static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
      << " PressureLimitTable[] = {\n";
@@ -1131,7 +1130,7 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) {
      << "  unsigned getNumRegPressureSets() const override;\n"
      << "  const char *getRegPressureSetName(unsigned Idx) const override;\n"
      << "  unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned "
-        "Idx, bool RemoveReserved = true) const override;\n"
+        "Idx) const override;\n"
      << "  const int *getRegClassPressureSets("
      << "const TargetRegisterClass *RC) const override;\n"
      << "  const int *getRegUnitPressureSets("


