[llvm] [RISCV][RFC/WIP] Enable load clustering in SelectionDAG scheduler (PR #74600)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 13 06:42:28 PST 2023


https://github.com/asb updated https://github.com/llvm/llvm-project/pull/74600

>From 037f49eab2bb4d92629cfb42586d51c9ca1dd473 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 13 Dec 2023 13:32:13 +0000
Subject: [PATCH 1/4] [MachineScheduler] Add option to control reordering for
 store/load clustering

Reordering based on the sort order of the MemOpInfo array was disabled
in <https://reviews.llvm.org/D72706>. However, it's not clear this is
desirable for all targets. It also makes it more difficult to compare
the incremental benefit of enabling load clustering in the SelectionDAG
scheduler as well as in the MachineScheduler, as the SelectionDAG
scheduler does seem to allow this reordering.

This patch adds a parameter that can control the behaviour on a
per-target basis.
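
As a rough illustration (a minimal sketch only, using a hypothetical
MyTargetPassConfig and assuming the usual TargetPassConfig /
MachineScheduler.h includes), a backend that wants the new behaviour
would opt in from its createMachineScheduler override, much like the
RISC-V hunk further down:

  ScheduleDAGInstrs *
  MyTargetPassConfig::createMachineScheduler(MachineSchedContext *C) const {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    // Passing ReorderWhileClustering=true keeps the order produced by the
    // MemOpInfo sort instead of forcing the lower NodeNum to come first.
    DAG->addMutation(createLoadClusterDAGMutation(
        DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
    return DAG;
  }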
---
 llvm/include/llvm/CodeGen/MachineScheduler.h  | 10 ++++--
 llvm/lib/CodeGen/MachineScheduler.cpp         | 31 ++++++++++++-------
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |  2 +-
 .../CodeGen/RISCV/misched-load-clustering.ll  |  6 ++--
 4 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 9f16cf5d5bc387..47127a6c29603b 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1347,13 +1347,19 @@ ScheduleDAGMILive *createGenericSchedLive(MachineSchedContext *C);
 /// Create a generic scheduler with no vreg liveness or DAG mutation passes.
 ScheduleDAGMI *createGenericSchedPostRA(MachineSchedContext *C);
 
+/// If ReorderWhileClustering is set to true, no attempt will be made to
+/// reduce reordering due to load clustering.
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI);
+                             const TargetRegisterInfo *TRI,
+                             bool ReorderWhileClustering = false);
 
+/// If ReorderWhileClustering is set to true, no attempt will be made to
+/// reduce reordering due to store clustering.
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI);
+                              const TargetRegisterInfo *TRI,
+                              bool ReorderWhileClustering = false);
 
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 886137d86f87de..554776783eff6f 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1743,11 +1743,14 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   bool IsLoad;
+  bool ReorderWhileClustering;
 
 public:
   BaseMemOpClusterMutation(const TargetInstrInfo *tii,
-                           const TargetRegisterInfo *tri, bool IsLoad)
-      : TII(tii), TRI(tri), IsLoad(IsLoad) {}
+                           const TargetRegisterInfo *tri, bool IsLoad,
+                           bool ReorderWhileClustering)
+      : TII(tii), TRI(tri), IsLoad(IsLoad),
+        ReorderWhileClustering(ReorderWhileClustering) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
@@ -1763,14 +1766,16 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
 class StoreClusterMutation : public BaseMemOpClusterMutation {
 public:
   StoreClusterMutation(const TargetInstrInfo *tii,
-                       const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, false) {}
+                       const TargetRegisterInfo *tri,
+                       bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, false, ReorderWhileClustering) {}
 };
 
 class LoadClusterMutation : public BaseMemOpClusterMutation {
 public:
-  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, true) {}
+  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri,
+                      bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, true, ReorderWhileClustering) {}
 };
 
 } // end anonymous namespace
@@ -1779,15 +1784,19 @@ namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
+                             const TargetRegisterInfo *TRI,
+                             bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
+                                  TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
+                              const TargetRegisterInfo *TRI,
+                              bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
+                                  TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
@@ -1840,7 +1849,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
 
     SUnit *SUa = MemOpa.SU;
     SUnit *SUb = MemOpb.SU;
-    if (SUa->NodeNum > SUb->NodeNum)
+    if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
       std::swap(SUa, SUb);
 
     // FIXME: Is this check really required?
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 3abdb6003659fa..ae10a72e889239 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -348,7 +348,7 @@ class RISCVPassConfig : public TargetPassConfig {
     ScheduleDAGMILive *DAG = nullptr;
     if (EnableMISchedLoadClustering) {
       DAG = createGenericSchedLive(C);
-      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI, true));
     }
     if (ST.hasMacroFusion()) {
       DAG = DAG ? DAG : createGenericSchedLive(C);
diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
index 4eb969a357a9ee..d046aaf98f5edd 100644
--- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
+++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
@@ -23,10 +23,10 @@ define i32 @load_clustering_1(ptr nocapture %p) {
 ; LDCLUSTER: ********** MI Scheduling **********
 ; LDCLUSTER-LABEL: load_clustering_1:%bb.0
 ; LDCLUSTER: *** Final schedule for %bb.0 ***
-; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
-; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
-; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
 ; LDCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
+; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
+; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
+; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
 entry:
   %arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3
   %val0 = load i32, i32* %arrayidx0

>From ef43091841e891768642ee0cf7db6f6ccef27cc1 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 13 Dec 2023 13:50:55 +0000
Subject: [PATCH 2/4] [RISCV] Change heuristic used for load clustering

Split out from #73789. This clusters the memory operations if they are
within a cache line of each other (as AMDGPU does in
shouldScheduleLoadsNear). X86 does something similar, but checks
`((Offset2 - Offset1) / 8 > 64)`. I'm not sure if that's intentionally
set to 512 bytes or if the division is in error.
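
To spell out the arithmetic (assuming, as the wording above implies,
that the check evaluating to true means "too far apart to cluster"):
with integer division `(Offset2 - Offset1) / 8 > 64` only becomes true
once the offsets differ by 520 bytes or more, so X86 keeps clustering
loads that are up to roughly 512 bytes apart, eight times a typical
64-byte cache line, which is why the division looks suspicious.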

Adopts the suggestion from @wangpc-pp to query the cache line size and
use it if available.
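
As a concrete example of the resulting heuristic (using the 64-byte
fallback that applies when the subtarget does not report a cache line
size): two loads at offsets 4 and 16 from the same base give
std::abs(4 - 16) = 12 < 64 and are clustered, while loads at offsets 0
and 96 give 96 >= 64 and are not.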
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1dcff7eb563e20..b8960fd9571c9b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2282,9 +2282,13 @@ bool RISCVInstrInfo::shouldClusterMemOps(
     return false;
   }
 
-  // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
-  // indicate they likely share a cache line.
-  return ClusterSize <= 4;
+  unsigned CacheLineSize =
+      BaseOps1.front()->getParent()->getMF()->getSubtarget().getCacheLineSize();
+  // Assume a cache line size of 64 bytes if no size is set in RISCVSubtarget.
+  CacheLineSize = CacheLineSize ? CacheLineSize : 64;
+  // Cluster if the memory operations are on the same or a neighbouring cache
+  // line.
+  return std::abs(Offset1 - Offset2) < CacheLineSize;
 }
 
 // Set BaseReg (the base register operand), Offset (the byte offset being

>From 1a46b84cdd3e63afc070fd2524d098a3fd59c231 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 13 Dec 2023 14:10:42 +0000
Subject: [PATCH 3/4] [RISCV] Enable load clustering by default

---
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   16 +-
 llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll  |   24 +-
 llvm/test/CodeGen/RISCV/add-before-shl.ll     |   38 +-
 llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll |  104 +-
 llvm/test/CodeGen/RISCV/atomic-rmw.ll         | 1040 ++---
 llvm/test/CodeGen/RISCV/atomic-signext.ll     |  208 +-
 .../CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll |  112 +-
 .../test/CodeGen/RISCV/callee-saved-fpr32s.ll |  720 ++--
 .../test/CodeGen/RISCV/callee-saved-fpr64s.ll |  496 +--
 llvm/test/CodeGen/RISCV/callee-saved-gprs.ll  | 1044 ++---
 ...calling-conv-ilp32-ilp32f-ilp32d-common.ll |   92 +-
 .../calling-conv-lp64-lp64f-lp64d-common.ll   |   46 +-
 llvm/test/CodeGen/RISCV/forced-atomics.ll     |  144 +-
 llvm/test/CodeGen/RISCV/fpclamptosat.ll       |  392 +-
 llvm/test/CodeGen/RISCV/iabs.ll               |    8 +-
 .../test/CodeGen/RISCV/intrinsic-cttz-elts.ll |    4 +-
 llvm/test/CodeGen/RISCV/legalize-fneg.ll      |   10 +-
 llvm/test/CodeGen/RISCV/llvm.exp10.ll         |   34 +-
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         |  112 +-
 llvm/test/CodeGen/RISCV/memcpy.ll             |   62 +-
 .../CodeGen/RISCV/misched-load-clustering.ll  |   17 +-
 llvm/test/CodeGen/RISCV/mul.ll                |  134 +-
 llvm/test/CodeGen/RISCV/nontemporal.ll        | 1420 +++----
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |    2 +-
 llvm/test/CodeGen/RISCV/push-pop-popret.ll    | 2004 ++++-----
 .../test/CodeGen/RISCV/reduction-formation.ll |   72 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |   94 +-
 llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll   |    4 +-
 llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll    |   64 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-elen.ll   |   50 +-
 .../RISCV/rvv/fixed-vectors-fp2i-sat.ll       |  140 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll   |   96 +-
 ...fixed-vectors-interleaved-access-zve32x.ll |   36 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-lrint.ll  |   40 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  | 1258 +++---
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll | 1048 ++---
 .../fixed-vectors-strided-load-store-asm.ll   |   16 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     |  456 +--
 llvm/test/CodeGen/RISCV/rvv/pr63596.ll        |   12 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |  468 ++-
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |  190 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    |  638 +--
 llvm/test/CodeGen/RISCV/stack-store-check.ll  |   32 +-
 .../RISCV/umulo-128-legalisation-lowering.ll  |   96 +-
 .../CodeGen/RISCV/unaligned-load-store.ll     |   57 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    |  450 +-
 llvm/test/CodeGen/RISCV/vararg.ll             |   30 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 2004 ++++-----
 .../RISCV/wide-scalar-shift-legalization.ll   | 3638 +++++++++--------
 llvm/test/CodeGen/RISCV/xtheadmempair.ll      |   14 +-
 50 files changed, 9657 insertions(+), 9629 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index ae10a72e889239..7b0fccd997d30d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -95,11 +95,6 @@ static cl::opt<bool>
                         cl::desc("Enable Split RegisterAlloc for RVV"),
                         cl::init(true));
 
-static cl::opt<bool> EnableMISchedLoadClustering(
-    "riscv-misched-load-clustering", cl::Hidden,
-    cl::desc("Enable load clustering in the machine scheduler"),
-    cl::init(false));
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -345,15 +340,10 @@ class RISCVPassConfig : public TargetPassConfig {
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
-    ScheduleDAGMILive *DAG = nullptr;
-    if (EnableMISchedLoadClustering) {
-      DAG = createGenericSchedLive(C);
-      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI, true));
-    }
-    if (ST.hasMacroFusion()) {
-      DAG = DAG ? DAG : createGenericSchedLive(C);
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI, true));
+    if (ST.hasMacroFusion())
       DAG->addMutation(createRISCVMacroFusionDAGMutation());
-    }
     return DAG;
   }
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
index 501a3c0ce74380..b77d6bb1fac8f3 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
@@ -63,15 +63,15 @@ define i32 @va1(ptr %fmt, ...) {
 ; RV64-NEXT:    sd a2, 32(sp)
 ; RV64-NEXT:    sd a3, 40(sp)
 ; RV64-NEXT:    sd a4, 48(sp)
-; RV64-NEXT:    sd a5, 56(sp)
 ; RV64-NEXT:    addi a0, sp, 24
 ; RV64-NEXT:    sd a0, 8(sp)
-; RV64-NEXT:    lw a0, 12(sp)
-; RV64-NEXT:    lwu a1, 8(sp)
+; RV64-NEXT:    lwu a0, 8(sp)
+; RV64-NEXT:    lw a1, 12(sp)
+; RV64-NEXT:    sd a5, 56(sp)
 ; RV64-NEXT:    sd a6, 64(sp)
 ; RV64-NEXT:    sd a7, 72(sp)
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    addi a1, a0, 4
 ; RV64-NEXT:    srli a2, a1, 32
 ; RV64-NEXT:    sw a1, 8(sp)
@@ -968,22 +968,22 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    sd a4, 304(a0)
 ; RV64-NEXT:    lui a0, 24414
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    sd a5, 312(a0)
-; RV64-NEXT:    lui a0, 24414
 ; RV64-NEXT:    addiw a0, a0, 280
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    sd a0, 8(sp)
-; RV64-NEXT:    lw a0, 12(sp)
-; RV64-NEXT:    lwu a1, 8(sp)
+; RV64-NEXT:    lwu a0, 8(sp)
+; RV64-NEXT:    lw a1, 12(sp)
+; RV64-NEXT:    lui a2, 24414
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    sd a5, 312(a2)
 ; RV64-NEXT:    lui a2, 24414
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    sd a6, 320(a2)
 ; RV64-NEXT:    lui a2, 24414
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    sd a7, 328(a2)
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    addi a1, a0, 4
 ; RV64-NEXT:    srli a2, a1, 32
 ; RV64-NEXT:    sw a1, 8(sp)
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 274f1cef49aa95..823918f1c42e7a 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -167,17 +167,17 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 12(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    srli a5, a2, 29
 ; RV32I-NEXT:    slli a6, a3, 3
 ; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    srli a3, a3, 29
-; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    slli a6, a4, 3
 ; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    srli a1, a1, 29
-; RV32I-NEXT:    slli a4, a4, 3
-; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    srli a4, a4, 29
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a2, a2, 3
 ; RV32I-NEXT:    lui a4, 128
 ; RV32I-NEXT:    add a1, a1, a4
@@ -200,26 +200,26 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    lw a6, 4(a1)
-; RV32C-NEXT:    c.lw a3, 12(a1)
-; RV32C-NEXT:    c.lw a4, 0(a1)
+; RV32C-NEXT:    c.lw a2, 12(a1)
+; RV32C-NEXT:    lw a6, 0(a1)
+; RV32C-NEXT:    c.lw a3, 4(a1)
 ; RV32C-NEXT:    c.lw a1, 8(a1)
 ; RV32C-NEXT:    c.lui a5, 16
-; RV32C-NEXT:    c.add a3, a5
-; RV32C-NEXT:    c.slli a3, 3
+; RV32C-NEXT:    c.add a2, a5
+; RV32C-NEXT:    c.slli a2, 3
 ; RV32C-NEXT:    srli a5, a1, 29
-; RV32C-NEXT:    c.or a3, a5
-; RV32C-NEXT:    srli a5, a4, 29
-; RV32C-NEXT:    slli a2, a6, 3
 ; RV32C-NEXT:    c.or a2, a5
 ; RV32C-NEXT:    srli a5, a6, 29
+; RV32C-NEXT:    slli a4, a3, 3
+; RV32C-NEXT:    c.or a4, a5
+; RV32C-NEXT:    c.srli a3, 29
 ; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.or a1, a5
-; RV32C-NEXT:    c.slli a4, 3
-; RV32C-NEXT:    c.sw a4, 0(a0)
+; RV32C-NEXT:    c.or a1, a3
+; RV32C-NEXT:    c.slli a6, 3
+; RV32C-NEXT:    sw a6, 0(a0)
 ; RV32C-NEXT:    c.sw a1, 8(a0)
-; RV32C-NEXT:    c.sw a2, 4(a0)
-; RV32C-NEXT:    c.sw a3, 12(a0)
+; RV32C-NEXT:    c.sw a4, 4(a0)
+; RV32C-NEXT:    c.sw a2, 12(a0)
 ; RV32C-NEXT:    c.jr ra
 ;
 ; RV64C-LABEL: add_wide_operand:
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
index 895852b84e0041..c9b1a00968ab95 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
@@ -192,37 +192,37 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
-; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a5, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB11_2
 ; RV32-NEXT:  .LBB11_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    sw a5, 8(sp)
-; RV32-NEXT:    sw a4, 12(sp)
+; RV32-NEXT:    sw a4, 8(sp)
+; RV32-NEXT:    sw a5, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32-NEXT:    lw a4, 12(sp)
-; RV32-NEXT:    lw a5, 8(sp)
+; RV32-NEXT:    lw a4, 8(sp)
+; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB11_6
 ; RV32-NEXT:  .LBB11_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a4, s1, .LBB11_4
+; RV32-NEXT:    beq a5, s1, .LBB11_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    slt a0, s1, a4
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    slt a0, s1, a5
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    bnez a0, .LBB11_1
 ; RV32-NEXT:    j .LBB11_5
 ; RV32-NEXT:  .LBB11_4: # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a5
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    sltu a0, s2, a4
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    bnez a0, .LBB11_1
 ; RV32-NEXT:  .LBB11_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
@@ -268,37 +268,37 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
-; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a5, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB13_2
 ; RV32-NEXT:  .LBB13_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    sw a5, 8(sp)
-; RV32-NEXT:    sw a4, 12(sp)
+; RV32-NEXT:    sw a4, 8(sp)
+; RV32-NEXT:    sw a5, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32-NEXT:    lw a4, 12(sp)
-; RV32-NEXT:    lw a5, 8(sp)
+; RV32-NEXT:    lw a4, 8(sp)
+; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_6
 ; RV32-NEXT:  .LBB13_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a4, s1, .LBB13_4
+; RV32-NEXT:    beq a5, s1, .LBB13_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    sltu a0, s1, a4
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    sltu a0, s1, a5
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    bnez a0, .LBB13_1
 ; RV32-NEXT:    j .LBB13_5
 ; RV32-NEXT:  .LBB13_4: # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a5
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    sltu a0, s2, a4
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    bnez a0, .LBB13_1
 ; RV32-NEXT:  .LBB13_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
@@ -344,37 +344,37 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
-; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a5, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB15_2
 ; RV32-NEXT:  .LBB15_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    sw a5, 8(sp)
-; RV32-NEXT:    sw a4, 12(sp)
+; RV32-NEXT:    sw a4, 8(sp)
+; RV32-NEXT:    sw a5, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32-NEXT:    lw a4, 12(sp)
-; RV32-NEXT:    lw a5, 8(sp)
+; RV32-NEXT:    lw a4, 8(sp)
+; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_6
 ; RV32-NEXT:  .LBB15_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a4, s1, .LBB15_4
+; RV32-NEXT:    beq a5, s1, .LBB15_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    slt a0, s1, a4
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    slt a0, s1, a5
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    beqz a0, .LBB15_1
 ; RV32-NEXT:    j .LBB15_5
 ; RV32-NEXT:  .LBB15_4: # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a5
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    sltu a0, s2, a4
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    beqz a0, .LBB15_1
 ; RV32-NEXT:  .LBB15_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
@@ -420,37 +420,37 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
-; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a5, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB17_2
 ; RV32-NEXT:  .LBB17_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    sw a5, 8(sp)
-; RV32-NEXT:    sw a4, 12(sp)
+; RV32-NEXT:    sw a4, 8(sp)
+; RV32-NEXT:    sw a5, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32-NEXT:    lw a4, 12(sp)
-; RV32-NEXT:    lw a5, 8(sp)
+; RV32-NEXT:    lw a4, 8(sp)
+; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB17_6
 ; RV32-NEXT:  .LBB17_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a4, s1, .LBB17_4
+; RV32-NEXT:    beq a5, s1, .LBB17_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    sltu a0, s1, a4
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    sltu a0, s1, a5
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    beqz a0, .LBB17_1
 ; RV32-NEXT:    j .LBB17_5
 ; RV32-NEXT:  .LBB17_4: # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a5
-; RV32-NEXT:    mv a2, a5
-; RV32-NEXT:    mv a3, a4
+; RV32-NEXT:    sltu a0, s2, a4
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    beqz a0, .LBB17_1
 ; RV32-NEXT:  .LBB17_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index e97a1ea5dfca00..512ee2eab2d311 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -19066,36 +19066,36 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB220_2
 ; RV32I-NEXT:  .LBB220_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB220_7
 ; RV32I-NEXT:  .LBB220_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB220_4
+; RV32I-NEXT:    beq a4, s1, .LBB220_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB220_5
 ; RV32I-NEXT:  .LBB220_4: # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB220_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB220_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
@@ -19103,8 +19103,8 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB220_1
 ; RV32I-NEXT:  .LBB220_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19120,36 +19120,36 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB220_2
 ; RV32IA-NEXT:  .LBB220_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB220_7
 ; RV32IA-NEXT:  .LBB220_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB220_4
+; RV32IA-NEXT:    beq a4, s1, .LBB220_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB220_5
 ; RV32IA-NEXT:  .LBB220_4: # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB220_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB220_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
@@ -19157,8 +19157,8 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB220_1
 ; RV32IA-NEXT:  .LBB220_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19219,36 +19219,36 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB221_2
 ; RV32I-NEXT:  .LBB221_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB221_7
 ; RV32I-NEXT:  .LBB221_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB221_4
+; RV32I-NEXT:    beq a4, s1, .LBB221_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB221_5
 ; RV32I-NEXT:  .LBB221_4: # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB221_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB221_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
@@ -19256,8 +19256,8 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB221_1
 ; RV32I-NEXT:  .LBB221_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19273,36 +19273,36 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB221_2
 ; RV32IA-NEXT:  .LBB221_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB221_7
 ; RV32IA-NEXT:  .LBB221_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB221_4
+; RV32IA-NEXT:    beq a4, s1, .LBB221_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB221_5
 ; RV32IA-NEXT:  .LBB221_4: # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB221_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB221_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
@@ -19310,8 +19310,8 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB221_1
 ; RV32IA-NEXT:  .LBB221_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19377,36 +19377,36 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB222_2
 ; RV32I-NEXT:  .LBB222_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB222_7
 ; RV32I-NEXT:  .LBB222_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB222_4
+; RV32I-NEXT:    beq a4, s1, .LBB222_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB222_5
 ; RV32I-NEXT:  .LBB222_4: # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB222_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB222_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
@@ -19414,8 +19414,8 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB222_1
 ; RV32I-NEXT:  .LBB222_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19431,36 +19431,36 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB222_2
 ; RV32IA-NEXT:  .LBB222_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB222_7
 ; RV32IA-NEXT:  .LBB222_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB222_4
+; RV32IA-NEXT:    beq a4, s1, .LBB222_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB222_5
 ; RV32IA-NEXT:  .LBB222_4: # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB222_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB222_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
@@ -19468,8 +19468,8 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB222_1
 ; RV32IA-NEXT:  .LBB222_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19535,36 +19535,36 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB223_2
 ; RV32I-NEXT:  .LBB223_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB223_7
 ; RV32I-NEXT:  .LBB223_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB223_4
+; RV32I-NEXT:    beq a4, s1, .LBB223_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB223_5
 ; RV32I-NEXT:  .LBB223_4: # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB223_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB223_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
@@ -19572,8 +19572,8 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB223_1
 ; RV32I-NEXT:  .LBB223_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19589,36 +19589,36 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB223_2
 ; RV32IA-NEXT:  .LBB223_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB223_7
 ; RV32IA-NEXT:  .LBB223_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB223_4
+; RV32IA-NEXT:    beq a4, s1, .LBB223_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB223_5
 ; RV32IA-NEXT:  .LBB223_4: # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB223_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB223_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
@@ -19626,8 +19626,8 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB223_1
 ; RV32IA-NEXT:  .LBB223_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19693,36 +19693,36 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB224_2
 ; RV32I-NEXT:  .LBB224_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB224_7
 ; RV32I-NEXT:  .LBB224_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB224_4
+; RV32I-NEXT:    beq a4, s1, .LBB224_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB224_5
 ; RV32I-NEXT:  .LBB224_4: # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB224_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB224_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
@@ -19730,8 +19730,8 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB224_1
 ; RV32I-NEXT:  .LBB224_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19747,36 +19747,36 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB224_2
 ; RV32IA-NEXT:  .LBB224_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB224_7
 ; RV32IA-NEXT:  .LBB224_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB224_4
+; RV32IA-NEXT:    beq a4, s1, .LBB224_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB224_5
 ; RV32IA-NEXT:  .LBB224_4: # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB224_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB224_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
@@ -19784,8 +19784,8 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB224_1
 ; RV32IA-NEXT:  .LBB224_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19851,36 +19851,36 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB225_2
 ; RV32I-NEXT:  .LBB225_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB225_7
 ; RV32I-NEXT:  .LBB225_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB225_4
+; RV32I-NEXT:    beq a4, s1, .LBB225_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB225_5
 ; RV32I-NEXT:  .LBB225_4: # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB225_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB225_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
@@ -19888,8 +19888,8 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB225_1
 ; RV32I-NEXT:  .LBB225_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19905,36 +19905,36 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB225_2
 ; RV32IA-NEXT:  .LBB225_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB225_7
 ; RV32IA-NEXT:  .LBB225_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB225_4
+; RV32IA-NEXT:    beq a4, s1, .LBB225_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB225_5
 ; RV32IA-NEXT:  .LBB225_4: # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB225_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB225_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
@@ -19942,8 +19942,8 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB225_1
 ; RV32IA-NEXT:  .LBB225_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20004,36 +20004,36 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB226_2
 ; RV32I-NEXT:  .LBB226_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8 at plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB226_7
 ; RV32I-NEXT:  .LBB226_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB226_4
+; RV32I-NEXT:    beq a4, s1, .LBB226_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB226_5
 ; RV32I-NEXT:  .LBB226_4: # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB226_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB226_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
@@ -20041,8 +20041,8 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB226_1
 ; RV32I-NEXT:  .LBB226_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20058,36 +20058,36 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB226_2
 ; RV32IA-NEXT:  .LBB226_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB226_7
 ; RV32IA-NEXT:  .LBB226_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB226_4
+; RV32IA-NEXT:    beq a4, s1, .LBB226_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB226_5
 ; RV32IA-NEXT:  .LBB226_4: # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB226_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB226_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
@@ -20095,8 +20095,8 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB226_1
 ; RV32IA-NEXT:  .LBB226_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20162,36 +20162,36 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB227_2
 ; RV32I-NEXT:  .LBB227_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB227_7
 ; RV32I-NEXT:  .LBB227_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB227_4
+; RV32I-NEXT:    beq a4, s1, .LBB227_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB227_5
 ; RV32I-NEXT:  .LBB227_4: # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB227_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB227_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
@@ -20199,8 +20199,8 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB227_1
 ; RV32I-NEXT:  .LBB227_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20216,36 +20216,36 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB227_2
 ; RV32IA-NEXT:  .LBB227_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB227_7
 ; RV32IA-NEXT:  .LBB227_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB227_4
+; RV32IA-NEXT:    beq a4, s1, .LBB227_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB227_5
 ; RV32IA-NEXT:  .LBB227_4: # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB227_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB227_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
@@ -20253,8 +20253,8 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB227_1
 ; RV32IA-NEXT:  .LBB227_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20320,36 +20320,36 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB228_2
 ; RV32I-NEXT:  .LBB228_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB228_7
 ; RV32I-NEXT:  .LBB228_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB228_4
+; RV32I-NEXT:    beq a4, s1, .LBB228_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB228_5
 ; RV32I-NEXT:  .LBB228_4: # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB228_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB228_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
@@ -20357,8 +20357,8 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB228_1
 ; RV32I-NEXT:  .LBB228_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20374,36 +20374,36 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB228_2
 ; RV32IA-NEXT:  .LBB228_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB228_7
 ; RV32IA-NEXT:  .LBB228_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB228_4
+; RV32IA-NEXT:    beq a4, s1, .LBB228_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB228_5
 ; RV32IA-NEXT:  .LBB228_4: # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB228_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB228_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
@@ -20411,8 +20411,8 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB228_1
 ; RV32IA-NEXT:  .LBB228_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20478,36 +20478,36 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB229_2
 ; RV32I-NEXT:  .LBB229_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB229_7
 ; RV32I-NEXT:  .LBB229_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB229_4
+; RV32I-NEXT:    beq a4, s1, .LBB229_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB229_5
 ; RV32I-NEXT:  .LBB229_4: # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB229_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB229_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
@@ -20515,8 +20515,8 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB229_1
 ; RV32I-NEXT:  .LBB229_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20532,36 +20532,36 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB229_2
 ; RV32IA-NEXT:  .LBB229_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB229_7
 ; RV32IA-NEXT:  .LBB229_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB229_4
+; RV32IA-NEXT:    beq a4, s1, .LBB229_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB229_5
 ; RV32IA-NEXT:  .LBB229_4: # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB229_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB229_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
@@ -20569,8 +20569,8 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB229_1
 ; RV32IA-NEXT:  .LBB229_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20636,36 +20636,36 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB230_2
 ; RV32I-NEXT:  .LBB230_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB230_7
 ; RV32I-NEXT:  .LBB230_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB230_4
+; RV32I-NEXT:    beq a4, s1, .LBB230_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB230_5
 ; RV32I-NEXT:  .LBB230_4: # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB230_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB230_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
@@ -20673,8 +20673,8 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB230_1
 ; RV32I-NEXT:  .LBB230_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20690,36 +20690,36 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB230_2
 ; RV32IA-NEXT:  .LBB230_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB230_7
 ; RV32IA-NEXT:  .LBB230_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB230_4
+; RV32IA-NEXT:    beq a4, s1, .LBB230_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB230_5
 ; RV32IA-NEXT:  .LBB230_4: # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB230_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB230_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
@@ -20727,8 +20727,8 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB230_1
 ; RV32IA-NEXT:  .LBB230_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20789,36 +20789,36 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB231_2
 ; RV32I-NEXT:  .LBB231_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB231_7
 ; RV32I-NEXT:  .LBB231_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB231_4
+; RV32I-NEXT:    beq a4, s1, .LBB231_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB231_5
 ; RV32I-NEXT:  .LBB231_4: # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB231_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB231_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
@@ -20826,8 +20826,8 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB231_1
 ; RV32I-NEXT:  .LBB231_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20843,36 +20843,36 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB231_2
 ; RV32IA-NEXT:  .LBB231_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB231_7
 ; RV32IA-NEXT:  .LBB231_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB231_4
+; RV32IA-NEXT:    beq a4, s1, .LBB231_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB231_5
 ; RV32IA-NEXT:  .LBB231_4: # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB231_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB231_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
@@ -20880,8 +20880,8 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB231_1
 ; RV32IA-NEXT:  .LBB231_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -20947,36 +20947,36 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB232_2
 ; RV32I-NEXT:  .LBB232_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB232_7
 ; RV32I-NEXT:  .LBB232_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB232_4
+; RV32I-NEXT:    beq a4, s1, .LBB232_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB232_5
 ; RV32I-NEXT:  .LBB232_4: # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB232_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB232_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
@@ -20984,8 +20984,8 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB232_1
 ; RV32I-NEXT:  .LBB232_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21001,36 +21001,36 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB232_2
 ; RV32IA-NEXT:  .LBB232_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB232_7
 ; RV32IA-NEXT:  .LBB232_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB232_4
+; RV32IA-NEXT:    beq a4, s1, .LBB232_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB232_5
 ; RV32IA-NEXT:  .LBB232_4: # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB232_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB232_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
@@ -21038,8 +21038,8 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB232_1
 ; RV32IA-NEXT:  .LBB232_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21105,36 +21105,36 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB233_2
 ; RV32I-NEXT:  .LBB233_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB233_7
 ; RV32I-NEXT:  .LBB233_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB233_4
+; RV32I-NEXT:    beq a4, s1, .LBB233_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB233_5
 ; RV32I-NEXT:  .LBB233_4: # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB233_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB233_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
@@ -21142,8 +21142,8 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB233_1
 ; RV32I-NEXT:  .LBB233_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21159,36 +21159,36 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB233_2
 ; RV32IA-NEXT:  .LBB233_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB233_7
 ; RV32IA-NEXT:  .LBB233_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB233_4
+; RV32IA-NEXT:    beq a4, s1, .LBB233_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB233_5
 ; RV32IA-NEXT:  .LBB233_4: # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB233_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB233_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
@@ -21196,8 +21196,8 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB233_1
 ; RV32IA-NEXT:  .LBB233_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21263,36 +21263,36 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB234_2
 ; RV32I-NEXT:  .LBB234_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB234_7
 ; RV32I-NEXT:  .LBB234_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB234_4
+; RV32I-NEXT:    beq a4, s1, .LBB234_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB234_5
 ; RV32I-NEXT:  .LBB234_4: # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB234_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB234_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
@@ -21300,8 +21300,8 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB234_1
 ; RV32I-NEXT:  .LBB234_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21317,36 +21317,36 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB234_2
 ; RV32IA-NEXT:  .LBB234_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB234_7
 ; RV32IA-NEXT:  .LBB234_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB234_4
+; RV32IA-NEXT:    beq a4, s1, .LBB234_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB234_5
 ; RV32IA-NEXT:  .LBB234_4: # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB234_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB234_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
@@ -21354,8 +21354,8 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB234_1
 ; RV32IA-NEXT:  .LBB234_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21421,36 +21421,36 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB235_2
 ; RV32I-NEXT:  .LBB235_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB235_7
 ; RV32I-NEXT:  .LBB235_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB235_4
+; RV32I-NEXT:    beq a4, s1, .LBB235_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB235_5
 ; RV32I-NEXT:  .LBB235_4: # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB235_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB235_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
@@ -21458,8 +21458,8 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB235_1
 ; RV32I-NEXT:  .LBB235_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21475,36 +21475,36 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB235_2
 ; RV32IA-NEXT:  .LBB235_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB235_7
 ; RV32IA-NEXT:  .LBB235_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB235_4
+; RV32IA-NEXT:    beq a4, s1, .LBB235_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB235_5
 ; RV32IA-NEXT:  .LBB235_4: # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB235_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB235_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
@@ -21512,8 +21512,8 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB235_1
 ; RV32IA-NEXT:  .LBB235_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21574,36 +21574,36 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB236_2
 ; RV32I-NEXT:  .LBB236_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB236_7
 ; RV32I-NEXT:  .LBB236_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB236_4
+; RV32I-NEXT:    beq a4, s1, .LBB236_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB236_5
 ; RV32I-NEXT:  .LBB236_4: # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB236_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB236_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
@@ -21611,8 +21611,8 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB236_1
 ; RV32I-NEXT:  .LBB236_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21628,36 +21628,36 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB236_2
 ; RV32IA-NEXT:  .LBB236_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB236_7
 ; RV32IA-NEXT:  .LBB236_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB236_4
+; RV32IA-NEXT:    beq a4, s1, .LBB236_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB236_5
 ; RV32IA-NEXT:  .LBB236_4: # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB236_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB236_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
@@ -21665,8 +21665,8 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB236_1
 ; RV32IA-NEXT:  .LBB236_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21732,36 +21732,36 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB237_2
 ; RV32I-NEXT:  .LBB237_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB237_7
 ; RV32I-NEXT:  .LBB237_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB237_4
+; RV32I-NEXT:    beq a4, s1, .LBB237_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB237_5
 ; RV32I-NEXT:  .LBB237_4: # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB237_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB237_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
@@ -21769,8 +21769,8 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB237_1
 ; RV32I-NEXT:  .LBB237_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21786,36 +21786,36 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB237_2
 ; RV32IA-NEXT:  .LBB237_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB237_7
 ; RV32IA-NEXT:  .LBB237_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB237_4
+; RV32IA-NEXT:    beq a4, s1, .LBB237_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB237_5
 ; RV32IA-NEXT:  .LBB237_4: # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB237_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB237_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
@@ -21823,8 +21823,8 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB237_1
 ; RV32IA-NEXT:  .LBB237_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21890,36 +21890,36 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB238_2
 ; RV32I-NEXT:  .LBB238_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB238_7
 ; RV32I-NEXT:  .LBB238_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB238_4
+; RV32I-NEXT:    beq a4, s1, .LBB238_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB238_5
 ; RV32I-NEXT:  .LBB238_4: # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB238_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB238_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
@@ -21927,8 +21927,8 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB238_1
 ; RV32I-NEXT:  .LBB238_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -21944,36 +21944,36 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB238_2
 ; RV32IA-NEXT:  .LBB238_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB238_7
 ; RV32IA-NEXT:  .LBB238_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB238_4
+; RV32IA-NEXT:    beq a4, s1, .LBB238_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB238_5
 ; RV32IA-NEXT:  .LBB238_4: # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB238_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB238_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
@@ -21981,8 +21981,8 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB238_1
 ; RV32IA-NEXT:  .LBB238_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -22048,36 +22048,36 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB239_2
 ; RV32I-NEXT:  .LBB239_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB239_7
 ; RV32I-NEXT:  .LBB239_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB239_4
+; RV32I-NEXT:    beq a4, s1, .LBB239_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB239_5
 ; RV32I-NEXT:  .LBB239_4: # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB239_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB239_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
@@ -22085,8 +22085,8 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB239_1
 ; RV32I-NEXT:  .LBB239_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -22102,36 +22102,36 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB239_2
 ; RV32IA-NEXT:  .LBB239_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB239_7
 ; RV32IA-NEXT:  .LBB239_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB239_4
+; RV32IA-NEXT:    beq a4, s1, .LBB239_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB239_5
 ; RV32IA-NEXT:  .LBB239_4: # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB239_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB239_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
@@ -22139,8 +22139,8 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB239_1
 ; RV32IA-NEXT:  .LBB239_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index 2739fde250ee27..793ca7d08f5134 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -3137,36 +3137,36 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB43_2
 ; RV32I-NEXT:  .LBB43_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB43_7
 ; RV32I-NEXT:  .LBB43_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB43_4
+; RV32I-NEXT:    beq a4, s1, .LBB43_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB43_5
 ; RV32I-NEXT:  .LBB43_4: # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB43_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB43_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
@@ -3174,8 +3174,8 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB43_1
 ; RV32I-NEXT:  .LBB43_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3191,36 +3191,36 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB43_2
 ; RV32IA-NEXT:  .LBB43_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB43_7
 ; RV32IA-NEXT:  .LBB43_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB43_4
+; RV32IA-NEXT:    beq a4, s1, .LBB43_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB43_5
 ; RV32IA-NEXT:  .LBB43_4: # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB43_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB43_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
@@ -3228,8 +3228,8 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB43_1
 ; RV32IA-NEXT:  .LBB43_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3290,36 +3290,36 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB44_2
 ; RV32I-NEXT:  .LBB44_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB44_7
 ; RV32I-NEXT:  .LBB44_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB44_4
+; RV32I-NEXT:    beq a4, s1, .LBB44_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a5
+; RV32I-NEXT:    slt a0, s1, a4
 ; RV32I-NEXT:    j .LBB44_5
 ; RV32I-NEXT:  .LBB44_4: # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB44_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB44_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
@@ -3327,8 +3327,8 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB44_1
 ; RV32I-NEXT:  .LBB44_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3344,36 +3344,36 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB44_2
 ; RV32IA-NEXT:  .LBB44_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB44_7
 ; RV32IA-NEXT:  .LBB44_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB44_4
+; RV32IA-NEXT:    beq a4, s1, .LBB44_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a5
+; RV32IA-NEXT:    slt a0, s1, a4
 ; RV32IA-NEXT:    j .LBB44_5
 ; RV32IA-NEXT:  .LBB44_4: # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB44_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB44_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
@@ -3381,8 +3381,8 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB44_1
 ; RV32IA-NEXT:  .LBB44_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3443,36 +3443,36 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB45_2
 ; RV32I-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB45_7
 ; RV32I-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB45_4
+; RV32I-NEXT:    beq a4, s1, .LBB45_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB45_5
 ; RV32I-NEXT:  .LBB45_4: # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB45_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    bnez a0, .LBB45_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
@@ -3480,8 +3480,8 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB45_1
 ; RV32I-NEXT:  .LBB45_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3497,36 +3497,36 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB45_2
 ; RV32IA-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB45_7
 ; RV32IA-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB45_4
+; RV32IA-NEXT:    beq a4, s1, .LBB45_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB45_5
 ; RV32IA-NEXT:  .LBB45_4: # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB45_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    bnez a0, .LBB45_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
@@ -3534,8 +3534,8 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB45_1
 ; RV32IA-NEXT:  .LBB45_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3596,36 +3596,36 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB46_2
 ; RV32I-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB46_7
 ; RV32I-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB46_4
+; RV32I-NEXT:    beq a4, s1, .LBB46_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB46_5
 ; RV32I-NEXT:  .LBB46_4: # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB46_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a2, a5
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:    beqz a0, .LBB46_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
@@ -3633,8 +3633,8 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s1
 ; RV32I-NEXT:    j .LBB46_1
 ; RV32I-NEXT:  .LBB46_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3650,36 +3650,36 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB46_2
 ; RV32IA-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB46_7
 ; RV32IA-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB46_4
+; RV32IA-NEXT:    beq a4, s1, .LBB46_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB46_5
 ; RV32IA-NEXT:  .LBB46_4: # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB46_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a2, a5
+; RV32IA-NEXT:    mv a3, a4
 ; RV32IA-NEXT:    beqz a0, .LBB46_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
@@ -3687,8 +3687,8 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    mv a3, s1
 ; RV32IA-NEXT:    j .LBB46_1
 ; RV32IA-NEXT:  .LBB46_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index 5f15a9c0671021..2fa610e7548b84 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -468,41 +468,41 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a5, s1
+; RV32I-NEXT:    sltu a0, a4, s1
 ; RV32I-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    addi a1, a4, 1
+; RV32I-NEXT:    addi a1, a5, 1
 ; RV32I-NEXT:    seqz a2, a1
-; RV32I-NEXT:    add a3, a5, a2
+; RV32I-NEXT:    add a3, a4, a2
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a2, a0, a1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB3_5
 ; RV32I-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    bne a5, s1, .LBB3_1
+; RV32I-NEXT:    bne a4, s1, .LBB3_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a4, s2
+; RV32I-NEXT:    sltu a0, a5, s2
 ; RV32I-NEXT:    j .LBB3_2
 ; RV32I-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -523,41 +523,41 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB3_3
 ; RV32IA-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a5, s1
+; RV32IA-NEXT:    sltu a0, a4, s1
 ; RV32IA-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    addi a1, a4, 1
+; RV32IA-NEXT:    addi a1, a5, 1
 ; RV32IA-NEXT:    seqz a2, a1
-; RV32IA-NEXT:    add a3, a5, a2
+; RV32IA-NEXT:    add a3, a4, a2
 ; RV32IA-NEXT:    neg a0, a0
 ; RV32IA-NEXT:    and a2, a0, a1
 ; RV32IA-NEXT:    and a3, a0, a3
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB3_5
 ; RV32IA-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    bne a5, s1, .LBB3_1
+; RV32IA-NEXT:    bne a4, s1, .LBB3_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a4, s2
+; RV32IA-NEXT:    sltu a0, a5, s2
 ; RV32IA-NEXT:    j .LBB3_2
 ; RV32IA-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1211,35 +1211,35 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 0(a0)
+; RV32I-NEXT:    lw a4, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB7_2
 ; RV32I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 8(sp)
+; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB7_7
 ; RV32I-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s1, .LBB7_4
+; RV32I-NEXT:    beq a4, s1, .LBB7_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a5
+; RV32I-NEXT:    sltu a0, s1, a4
 ; RV32I-NEXT:    j .LBB7_5
 ; RV32I-NEXT:  .LBB7_4: # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a0, s2, a5
 ; RV32I-NEXT:  .LBB7_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    or a1, a4, a5
+; RV32I-NEXT:    or a1, a5, a4
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    mv a2, s2
@@ -1247,13 +1247,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    bnez a0, .LBB7_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    seqz a0, a4
-; RV32I-NEXT:    sub a3, a5, a0
-; RV32I-NEXT:    addi a2, a4, -1
+; RV32I-NEXT:    seqz a0, a5
+; RV32I-NEXT:    sub a3, a4, a0
+; RV32I-NEXT:    addi a2, a5, -1
 ; RV32I-NEXT:    j .LBB7_1
 ; RV32I-NEXT:  .LBB7_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a0, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1274,35 +1274,35 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 0(a0)
+; RV32IA-NEXT:    lw a4, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB7_2
 ; RV32IA-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a5, 8(sp)
+; RV32IA-NEXT:    sw a4, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
 ; RV32IA-NEXT:    mv a0, s0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 8(sp)
+; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB7_7
 ; RV32IA-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s1, .LBB7_4
+; RV32IA-NEXT:    beq a4, s1, .LBB7_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a5
+; RV32IA-NEXT:    sltu a0, s1, a4
 ; RV32IA-NEXT:    j .LBB7_5
 ; RV32IA-NEXT:  .LBB7_4: # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a0, s2, a5
 ; RV32IA-NEXT:  .LBB7_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    or a1, a4, a5
+; RV32IA-NEXT:    or a1, a5, a4
 ; RV32IA-NEXT:    seqz a1, a1
 ; RV32IA-NEXT:    or a0, a1, a0
 ; RV32IA-NEXT:    mv a2, s2
@@ -1310,13 +1310,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    bnez a0, .LBB7_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    seqz a0, a4
-; RV32IA-NEXT:    sub a3, a5, a0
-; RV32IA-NEXT:    addi a2, a4, -1
+; RV32IA-NEXT:    seqz a0, a5
+; RV32IA-NEXT:    sub a3, a4, a0
+; RV32IA-NEXT:    addi a2, a5, -1
 ; RV32IA-NEXT:    j .LBB7_1
 ; RV32IA-NEXT:  .LBB7_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
+; RV32IA-NEXT:    mv a0, a5
+; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 7111316931f19b..7aeaaab68a208c 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -25,69 +25,69 @@ define void @callee() nounwind {
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    lui a0, %hi(var)
 ; ILP32-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32-NEXT:    addi a1, a0, %lo(var)
-; ILP32-NEXT:    flw fa1, 16(a1)
-; ILP32-NEXT:    flw fa0, 20(a1)
-; ILP32-NEXT:    flw ft0, 24(a1)
-; ILP32-NEXT:    flw ft1, 28(a1)
-; ILP32-NEXT:    flw ft2, 32(a1)
-; ILP32-NEXT:    flw ft3, 36(a1)
-; ILP32-NEXT:    flw ft4, 40(a1)
-; ILP32-NEXT:    flw ft5, 44(a1)
-; ILP32-NEXT:    flw ft6, 48(a1)
-; ILP32-NEXT:    flw ft7, 52(a1)
-; ILP32-NEXT:    flw fa6, 56(a1)
-; ILP32-NEXT:    flw fa7, 60(a1)
-; ILP32-NEXT:    flw ft8, 64(a1)
-; ILP32-NEXT:    flw ft9, 68(a1)
-; ILP32-NEXT:    flw ft10, 72(a1)
-; ILP32-NEXT:    flw ft11, 76(a1)
-; ILP32-NEXT:    flw fs0, 80(a1)
-; ILP32-NEXT:    flw fs1, 84(a1)
-; ILP32-NEXT:    flw fs2, 88(a1)
-; ILP32-NEXT:    flw fs3, 92(a1)
-; ILP32-NEXT:    flw fs4, 96(a1)
-; ILP32-NEXT:    flw fs5, 100(a1)
-; ILP32-NEXT:    flw fs6, 104(a1)
-; ILP32-NEXT:    flw fs7, 108(a1)
+; ILP32-NEXT:    flw fa4, 16(a1)
+; ILP32-NEXT:    flw fa3, 20(a1)
+; ILP32-NEXT:    flw fa2, 24(a1)
+; ILP32-NEXT:    flw fa1, 28(a1)
+; ILP32-NEXT:    flw fa0, 32(a1)
+; ILP32-NEXT:    flw ft0, 36(a1)
+; ILP32-NEXT:    flw ft1, 40(a1)
+; ILP32-NEXT:    flw ft2, 44(a1)
+; ILP32-NEXT:    flw ft3, 48(a1)
+; ILP32-NEXT:    flw ft4, 52(a1)
+; ILP32-NEXT:    flw ft5, 56(a1)
+; ILP32-NEXT:    flw ft6, 60(a1)
+; ILP32-NEXT:    flw ft7, 64(a1)
+; ILP32-NEXT:    flw fa6, 68(a1)
+; ILP32-NEXT:    flw fa7, 72(a1)
+; ILP32-NEXT:    flw ft8, 76(a1)
+; ILP32-NEXT:    flw ft9, 80(a1)
+; ILP32-NEXT:    flw ft10, 84(a1)
+; ILP32-NEXT:    flw ft11, 88(a1)
+; ILP32-NEXT:    flw fs0, 92(a1)
+; ILP32-NEXT:    flw fs1, 96(a1)
+; ILP32-NEXT:    flw fs2, 100(a1)
+; ILP32-NEXT:    flw fs3, 104(a1)
+; ILP32-NEXT:    flw fs4, 108(a1)
+; ILP32-NEXT:    flw fs5, 112(a1)
+; ILP32-NEXT:    flw fs6, 116(a1)
+; ILP32-NEXT:    flw fs7, 120(a1)
 ; ILP32-NEXT:    flw fs8, 124(a1)
-; ILP32-NEXT:    flw fs9, 120(a1)
-; ILP32-NEXT:    flw fs10, 116(a1)
-; ILP32-NEXT:    flw fs11, 112(a1)
+; ILP32-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32-NEXT:    fsw fs8, 124(a1)
-; ILP32-NEXT:    fsw fs9, 120(a1)
-; ILP32-NEXT:    fsw fs10, 116(a1)
-; ILP32-NEXT:    fsw fs11, 112(a1)
-; ILP32-NEXT:    fsw fs7, 108(a1)
-; ILP32-NEXT:    fsw fs6, 104(a1)
-; ILP32-NEXT:    fsw fs5, 100(a1)
-; ILP32-NEXT:    fsw fs4, 96(a1)
-; ILP32-NEXT:    fsw fs3, 92(a1)
-; ILP32-NEXT:    fsw fs2, 88(a1)
-; ILP32-NEXT:    fsw fs1, 84(a1)
-; ILP32-NEXT:    fsw fs0, 80(a1)
-; ILP32-NEXT:    fsw ft11, 76(a1)
-; ILP32-NEXT:    fsw ft10, 72(a1)
-; ILP32-NEXT:    fsw ft9, 68(a1)
-; ILP32-NEXT:    fsw ft8, 64(a1)
-; ILP32-NEXT:    fsw fa7, 60(a1)
-; ILP32-NEXT:    fsw fa6, 56(a1)
-; ILP32-NEXT:    fsw ft7, 52(a1)
-; ILP32-NEXT:    fsw ft6, 48(a1)
-; ILP32-NEXT:    fsw ft5, 44(a1)
-; ILP32-NEXT:    fsw ft4, 40(a1)
-; ILP32-NEXT:    fsw ft3, 36(a1)
-; ILP32-NEXT:    fsw ft2, 32(a1)
-; ILP32-NEXT:    fsw ft1, 28(a1)
-; ILP32-NEXT:    fsw ft0, 24(a1)
-; ILP32-NEXT:    fsw fa0, 20(a1)
-; ILP32-NEXT:    fsw fa1, 16(a1)
-; ILP32-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32-NEXT:    fsw fs7, 120(a1)
+; ILP32-NEXT:    fsw fs6, 116(a1)
+; ILP32-NEXT:    fsw fs5, 112(a1)
+; ILP32-NEXT:    fsw fs4, 108(a1)
+; ILP32-NEXT:    fsw fs3, 104(a1)
+; ILP32-NEXT:    fsw fs2, 100(a1)
+; ILP32-NEXT:    fsw fs1, 96(a1)
+; ILP32-NEXT:    fsw fs0, 92(a1)
+; ILP32-NEXT:    fsw ft11, 88(a1)
+; ILP32-NEXT:    fsw ft10, 84(a1)
+; ILP32-NEXT:    fsw ft9, 80(a1)
+; ILP32-NEXT:    fsw ft8, 76(a1)
+; ILP32-NEXT:    fsw fa7, 72(a1)
+; ILP32-NEXT:    fsw fa6, 68(a1)
+; ILP32-NEXT:    fsw ft7, 64(a1)
+; ILP32-NEXT:    fsw ft6, 60(a1)
+; ILP32-NEXT:    fsw ft5, 56(a1)
+; ILP32-NEXT:    fsw ft4, 52(a1)
+; ILP32-NEXT:    fsw ft3, 48(a1)
+; ILP32-NEXT:    fsw ft2, 44(a1)
+; ILP32-NEXT:    fsw ft1, 40(a1)
+; ILP32-NEXT:    fsw ft0, 36(a1)
+; ILP32-NEXT:    fsw fa0, 32(a1)
+; ILP32-NEXT:    fsw fa1, 28(a1)
+; ILP32-NEXT:    fsw fa2, 24(a1)
+; ILP32-NEXT:    fsw fa3, 20(a1)
+; ILP32-NEXT:    fsw fa4, 16(a1)
+; ILP32-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32-NEXT:    ret
 ;
@@ -95,69 +95,69 @@ define void @callee() nounwind {
 ; LP64:       # %bb.0:
 ; LP64-NEXT:    lui a0, %hi(var)
 ; LP64-NEXT:    flw fa5, %lo(var)(a0)
-; LP64-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64-NEXT:    addi a1, a0, %lo(var)
-; LP64-NEXT:    flw fa1, 16(a1)
-; LP64-NEXT:    flw fa0, 20(a1)
-; LP64-NEXT:    flw ft0, 24(a1)
-; LP64-NEXT:    flw ft1, 28(a1)
-; LP64-NEXT:    flw ft2, 32(a1)
-; LP64-NEXT:    flw ft3, 36(a1)
-; LP64-NEXT:    flw ft4, 40(a1)
-; LP64-NEXT:    flw ft5, 44(a1)
-; LP64-NEXT:    flw ft6, 48(a1)
-; LP64-NEXT:    flw ft7, 52(a1)
-; LP64-NEXT:    flw fa6, 56(a1)
-; LP64-NEXT:    flw fa7, 60(a1)
-; LP64-NEXT:    flw ft8, 64(a1)
-; LP64-NEXT:    flw ft9, 68(a1)
-; LP64-NEXT:    flw ft10, 72(a1)
-; LP64-NEXT:    flw ft11, 76(a1)
-; LP64-NEXT:    flw fs0, 80(a1)
-; LP64-NEXT:    flw fs1, 84(a1)
-; LP64-NEXT:    flw fs2, 88(a1)
-; LP64-NEXT:    flw fs3, 92(a1)
-; LP64-NEXT:    flw fs4, 96(a1)
-; LP64-NEXT:    flw fs5, 100(a1)
-; LP64-NEXT:    flw fs6, 104(a1)
-; LP64-NEXT:    flw fs7, 108(a1)
+; LP64-NEXT:    flw fa4, 16(a1)
+; LP64-NEXT:    flw fa3, 20(a1)
+; LP64-NEXT:    flw fa2, 24(a1)
+; LP64-NEXT:    flw fa1, 28(a1)
+; LP64-NEXT:    flw fa0, 32(a1)
+; LP64-NEXT:    flw ft0, 36(a1)
+; LP64-NEXT:    flw ft1, 40(a1)
+; LP64-NEXT:    flw ft2, 44(a1)
+; LP64-NEXT:    flw ft3, 48(a1)
+; LP64-NEXT:    flw ft4, 52(a1)
+; LP64-NEXT:    flw ft5, 56(a1)
+; LP64-NEXT:    flw ft6, 60(a1)
+; LP64-NEXT:    flw ft7, 64(a1)
+; LP64-NEXT:    flw fa6, 68(a1)
+; LP64-NEXT:    flw fa7, 72(a1)
+; LP64-NEXT:    flw ft8, 76(a1)
+; LP64-NEXT:    flw ft9, 80(a1)
+; LP64-NEXT:    flw ft10, 84(a1)
+; LP64-NEXT:    flw ft11, 88(a1)
+; LP64-NEXT:    flw fs0, 92(a1)
+; LP64-NEXT:    flw fs1, 96(a1)
+; LP64-NEXT:    flw fs2, 100(a1)
+; LP64-NEXT:    flw fs3, 104(a1)
+; LP64-NEXT:    flw fs4, 108(a1)
+; LP64-NEXT:    flw fs5, 112(a1)
+; LP64-NEXT:    flw fs6, 116(a1)
+; LP64-NEXT:    flw fs7, 120(a1)
 ; LP64-NEXT:    flw fs8, 124(a1)
-; LP64-NEXT:    flw fs9, 120(a1)
-; LP64-NEXT:    flw fs10, 116(a1)
-; LP64-NEXT:    flw fs11, 112(a1)
+; LP64-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64-NEXT:    fsw fs8, 124(a1)
-; LP64-NEXT:    fsw fs9, 120(a1)
-; LP64-NEXT:    fsw fs10, 116(a1)
-; LP64-NEXT:    fsw fs11, 112(a1)
-; LP64-NEXT:    fsw fs7, 108(a1)
-; LP64-NEXT:    fsw fs6, 104(a1)
-; LP64-NEXT:    fsw fs5, 100(a1)
-; LP64-NEXT:    fsw fs4, 96(a1)
-; LP64-NEXT:    fsw fs3, 92(a1)
-; LP64-NEXT:    fsw fs2, 88(a1)
-; LP64-NEXT:    fsw fs1, 84(a1)
-; LP64-NEXT:    fsw fs0, 80(a1)
-; LP64-NEXT:    fsw ft11, 76(a1)
-; LP64-NEXT:    fsw ft10, 72(a1)
-; LP64-NEXT:    fsw ft9, 68(a1)
-; LP64-NEXT:    fsw ft8, 64(a1)
-; LP64-NEXT:    fsw fa7, 60(a1)
-; LP64-NEXT:    fsw fa6, 56(a1)
-; LP64-NEXT:    fsw ft7, 52(a1)
-; LP64-NEXT:    fsw ft6, 48(a1)
-; LP64-NEXT:    fsw ft5, 44(a1)
-; LP64-NEXT:    fsw ft4, 40(a1)
-; LP64-NEXT:    fsw ft3, 36(a1)
-; LP64-NEXT:    fsw ft2, 32(a1)
-; LP64-NEXT:    fsw ft1, 28(a1)
-; LP64-NEXT:    fsw ft0, 24(a1)
-; LP64-NEXT:    fsw fa0, 20(a1)
-; LP64-NEXT:    fsw fa1, 16(a1)
-; LP64-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64-NEXT:    fsw fs7, 120(a1)
+; LP64-NEXT:    fsw fs6, 116(a1)
+; LP64-NEXT:    fsw fs5, 112(a1)
+; LP64-NEXT:    fsw fs4, 108(a1)
+; LP64-NEXT:    fsw fs3, 104(a1)
+; LP64-NEXT:    fsw fs2, 100(a1)
+; LP64-NEXT:    fsw fs1, 96(a1)
+; LP64-NEXT:    fsw fs0, 92(a1)
+; LP64-NEXT:    fsw ft11, 88(a1)
+; LP64-NEXT:    fsw ft10, 84(a1)
+; LP64-NEXT:    fsw ft9, 80(a1)
+; LP64-NEXT:    fsw ft8, 76(a1)
+; LP64-NEXT:    fsw fa7, 72(a1)
+; LP64-NEXT:    fsw fa6, 68(a1)
+; LP64-NEXT:    fsw ft7, 64(a1)
+; LP64-NEXT:    fsw ft6, 60(a1)
+; LP64-NEXT:    fsw ft5, 56(a1)
+; LP64-NEXT:    fsw ft4, 52(a1)
+; LP64-NEXT:    fsw ft3, 48(a1)
+; LP64-NEXT:    fsw ft2, 44(a1)
+; LP64-NEXT:    fsw ft1, 40(a1)
+; LP64-NEXT:    fsw ft0, 36(a1)
+; LP64-NEXT:    fsw fa0, 32(a1)
+; LP64-NEXT:    fsw fa1, 28(a1)
+; LP64-NEXT:    fsw fa2, 24(a1)
+; LP64-NEXT:    fsw fa3, 20(a1)
+; LP64-NEXT:    fsw fa4, 16(a1)
+; LP64-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64-NEXT:    ret
 ;
@@ -178,69 +178,69 @@ define void @callee() nounwind {
 ; ILP32F-NEXT:    fsw fs11, 0(sp) # 4-byte Folded Spill
 ; ILP32F-NEXT:    lui a0, %hi(var)
 ; ILP32F-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32F-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32F-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32F-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32F-NEXT:    addi a1, a0, %lo(var)
-; ILP32F-NEXT:    flw fa1, 16(a1)
-; ILP32F-NEXT:    flw fa0, 20(a1)
-; ILP32F-NEXT:    flw ft0, 24(a1)
-; ILP32F-NEXT:    flw ft1, 28(a1)
-; ILP32F-NEXT:    flw ft2, 32(a1)
-; ILP32F-NEXT:    flw ft3, 36(a1)
-; ILP32F-NEXT:    flw ft4, 40(a1)
-; ILP32F-NEXT:    flw ft5, 44(a1)
-; ILP32F-NEXT:    flw ft6, 48(a1)
-; ILP32F-NEXT:    flw ft7, 52(a1)
-; ILP32F-NEXT:    flw fa6, 56(a1)
-; ILP32F-NEXT:    flw fa7, 60(a1)
-; ILP32F-NEXT:    flw ft8, 64(a1)
-; ILP32F-NEXT:    flw ft9, 68(a1)
-; ILP32F-NEXT:    flw ft10, 72(a1)
-; ILP32F-NEXT:    flw ft11, 76(a1)
-; ILP32F-NEXT:    flw fs0, 80(a1)
-; ILP32F-NEXT:    flw fs1, 84(a1)
-; ILP32F-NEXT:    flw fs2, 88(a1)
-; ILP32F-NEXT:    flw fs3, 92(a1)
-; ILP32F-NEXT:    flw fs4, 96(a1)
-; ILP32F-NEXT:    flw fs5, 100(a1)
-; ILP32F-NEXT:    flw fs6, 104(a1)
-; ILP32F-NEXT:    flw fs7, 108(a1)
+; ILP32F-NEXT:    flw fa4, 16(a1)
+; ILP32F-NEXT:    flw fa3, 20(a1)
+; ILP32F-NEXT:    flw fa2, 24(a1)
+; ILP32F-NEXT:    flw fa1, 28(a1)
+; ILP32F-NEXT:    flw fa0, 32(a1)
+; ILP32F-NEXT:    flw ft0, 36(a1)
+; ILP32F-NEXT:    flw ft1, 40(a1)
+; ILP32F-NEXT:    flw ft2, 44(a1)
+; ILP32F-NEXT:    flw ft3, 48(a1)
+; ILP32F-NEXT:    flw ft4, 52(a1)
+; ILP32F-NEXT:    flw ft5, 56(a1)
+; ILP32F-NEXT:    flw ft6, 60(a1)
+; ILP32F-NEXT:    flw ft7, 64(a1)
+; ILP32F-NEXT:    flw fa6, 68(a1)
+; ILP32F-NEXT:    flw fa7, 72(a1)
+; ILP32F-NEXT:    flw ft8, 76(a1)
+; ILP32F-NEXT:    flw ft9, 80(a1)
+; ILP32F-NEXT:    flw ft10, 84(a1)
+; ILP32F-NEXT:    flw ft11, 88(a1)
+; ILP32F-NEXT:    flw fs0, 92(a1)
+; ILP32F-NEXT:    flw fs1, 96(a1)
+; ILP32F-NEXT:    flw fs2, 100(a1)
+; ILP32F-NEXT:    flw fs3, 104(a1)
+; ILP32F-NEXT:    flw fs4, 108(a1)
+; ILP32F-NEXT:    flw fs5, 112(a1)
+; ILP32F-NEXT:    flw fs6, 116(a1)
+; ILP32F-NEXT:    flw fs7, 120(a1)
 ; ILP32F-NEXT:    flw fs8, 124(a1)
-; ILP32F-NEXT:    flw fs9, 120(a1)
-; ILP32F-NEXT:    flw fs10, 116(a1)
-; ILP32F-NEXT:    flw fs11, 112(a1)
+; ILP32F-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32F-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32F-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32F-NEXT:    fsw fs8, 124(a1)
-; ILP32F-NEXT:    fsw fs9, 120(a1)
-; ILP32F-NEXT:    fsw fs10, 116(a1)
-; ILP32F-NEXT:    fsw fs11, 112(a1)
-; ILP32F-NEXT:    fsw fs7, 108(a1)
-; ILP32F-NEXT:    fsw fs6, 104(a1)
-; ILP32F-NEXT:    fsw fs5, 100(a1)
-; ILP32F-NEXT:    fsw fs4, 96(a1)
-; ILP32F-NEXT:    fsw fs3, 92(a1)
-; ILP32F-NEXT:    fsw fs2, 88(a1)
-; ILP32F-NEXT:    fsw fs1, 84(a1)
-; ILP32F-NEXT:    fsw fs0, 80(a1)
-; ILP32F-NEXT:    fsw ft11, 76(a1)
-; ILP32F-NEXT:    fsw ft10, 72(a1)
-; ILP32F-NEXT:    fsw ft9, 68(a1)
-; ILP32F-NEXT:    fsw ft8, 64(a1)
-; ILP32F-NEXT:    fsw fa7, 60(a1)
-; ILP32F-NEXT:    fsw fa6, 56(a1)
-; ILP32F-NEXT:    fsw ft7, 52(a1)
-; ILP32F-NEXT:    fsw ft6, 48(a1)
-; ILP32F-NEXT:    fsw ft5, 44(a1)
-; ILP32F-NEXT:    fsw ft4, 40(a1)
-; ILP32F-NEXT:    fsw ft3, 36(a1)
-; ILP32F-NEXT:    fsw ft2, 32(a1)
-; ILP32F-NEXT:    fsw ft1, 28(a1)
-; ILP32F-NEXT:    fsw ft0, 24(a1)
-; ILP32F-NEXT:    fsw fa0, 20(a1)
-; ILP32F-NEXT:    fsw fa1, 16(a1)
-; ILP32F-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32F-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32F-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32F-NEXT:    fsw fs7, 120(a1)
+; ILP32F-NEXT:    fsw fs6, 116(a1)
+; ILP32F-NEXT:    fsw fs5, 112(a1)
+; ILP32F-NEXT:    fsw fs4, 108(a1)
+; ILP32F-NEXT:    fsw fs3, 104(a1)
+; ILP32F-NEXT:    fsw fs2, 100(a1)
+; ILP32F-NEXT:    fsw fs1, 96(a1)
+; ILP32F-NEXT:    fsw fs0, 92(a1)
+; ILP32F-NEXT:    fsw ft11, 88(a1)
+; ILP32F-NEXT:    fsw ft10, 84(a1)
+; ILP32F-NEXT:    fsw ft9, 80(a1)
+; ILP32F-NEXT:    fsw ft8, 76(a1)
+; ILP32F-NEXT:    fsw fa7, 72(a1)
+; ILP32F-NEXT:    fsw fa6, 68(a1)
+; ILP32F-NEXT:    fsw ft7, 64(a1)
+; ILP32F-NEXT:    fsw ft6, 60(a1)
+; ILP32F-NEXT:    fsw ft5, 56(a1)
+; ILP32F-NEXT:    fsw ft4, 52(a1)
+; ILP32F-NEXT:    fsw ft3, 48(a1)
+; ILP32F-NEXT:    fsw ft2, 44(a1)
+; ILP32F-NEXT:    fsw ft1, 40(a1)
+; ILP32F-NEXT:    fsw ft0, 36(a1)
+; ILP32F-NEXT:    fsw fa0, 32(a1)
+; ILP32F-NEXT:    fsw fa1, 28(a1)
+; ILP32F-NEXT:    fsw fa2, 24(a1)
+; ILP32F-NEXT:    fsw fa3, 20(a1)
+; ILP32F-NEXT:    fsw fa4, 16(a1)
+; ILP32F-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32F-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32F-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32F-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32F-NEXT:    flw fs0, 44(sp) # 4-byte Folded Reload
 ; ILP32F-NEXT:    flw fs1, 40(sp) # 4-byte Folded Reload
@@ -274,69 +274,69 @@ define void @callee() nounwind {
 ; LP64F-NEXT:    fsw fs11, 0(sp) # 4-byte Folded Spill
 ; LP64F-NEXT:    lui a0, %hi(var)
 ; LP64F-NEXT:    flw fa5, %lo(var)(a0)
-; LP64F-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64F-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64F-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64F-NEXT:    addi a1, a0, %lo(var)
-; LP64F-NEXT:    flw fa1, 16(a1)
-; LP64F-NEXT:    flw fa0, 20(a1)
-; LP64F-NEXT:    flw ft0, 24(a1)
-; LP64F-NEXT:    flw ft1, 28(a1)
-; LP64F-NEXT:    flw ft2, 32(a1)
-; LP64F-NEXT:    flw ft3, 36(a1)
-; LP64F-NEXT:    flw ft4, 40(a1)
-; LP64F-NEXT:    flw ft5, 44(a1)
-; LP64F-NEXT:    flw ft6, 48(a1)
-; LP64F-NEXT:    flw ft7, 52(a1)
-; LP64F-NEXT:    flw fa6, 56(a1)
-; LP64F-NEXT:    flw fa7, 60(a1)
-; LP64F-NEXT:    flw ft8, 64(a1)
-; LP64F-NEXT:    flw ft9, 68(a1)
-; LP64F-NEXT:    flw ft10, 72(a1)
-; LP64F-NEXT:    flw ft11, 76(a1)
-; LP64F-NEXT:    flw fs0, 80(a1)
-; LP64F-NEXT:    flw fs1, 84(a1)
-; LP64F-NEXT:    flw fs2, 88(a1)
-; LP64F-NEXT:    flw fs3, 92(a1)
-; LP64F-NEXT:    flw fs4, 96(a1)
-; LP64F-NEXT:    flw fs5, 100(a1)
-; LP64F-NEXT:    flw fs6, 104(a1)
-; LP64F-NEXT:    flw fs7, 108(a1)
+; LP64F-NEXT:    flw fa4, 16(a1)
+; LP64F-NEXT:    flw fa3, 20(a1)
+; LP64F-NEXT:    flw fa2, 24(a1)
+; LP64F-NEXT:    flw fa1, 28(a1)
+; LP64F-NEXT:    flw fa0, 32(a1)
+; LP64F-NEXT:    flw ft0, 36(a1)
+; LP64F-NEXT:    flw ft1, 40(a1)
+; LP64F-NEXT:    flw ft2, 44(a1)
+; LP64F-NEXT:    flw ft3, 48(a1)
+; LP64F-NEXT:    flw ft4, 52(a1)
+; LP64F-NEXT:    flw ft5, 56(a1)
+; LP64F-NEXT:    flw ft6, 60(a1)
+; LP64F-NEXT:    flw ft7, 64(a1)
+; LP64F-NEXT:    flw fa6, 68(a1)
+; LP64F-NEXT:    flw fa7, 72(a1)
+; LP64F-NEXT:    flw ft8, 76(a1)
+; LP64F-NEXT:    flw ft9, 80(a1)
+; LP64F-NEXT:    flw ft10, 84(a1)
+; LP64F-NEXT:    flw ft11, 88(a1)
+; LP64F-NEXT:    flw fs0, 92(a1)
+; LP64F-NEXT:    flw fs1, 96(a1)
+; LP64F-NEXT:    flw fs2, 100(a1)
+; LP64F-NEXT:    flw fs3, 104(a1)
+; LP64F-NEXT:    flw fs4, 108(a1)
+; LP64F-NEXT:    flw fs5, 112(a1)
+; LP64F-NEXT:    flw fs6, 116(a1)
+; LP64F-NEXT:    flw fs7, 120(a1)
 ; LP64F-NEXT:    flw fs8, 124(a1)
-; LP64F-NEXT:    flw fs9, 120(a1)
-; LP64F-NEXT:    flw fs10, 116(a1)
-; LP64F-NEXT:    flw fs11, 112(a1)
+; LP64F-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64F-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64F-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64F-NEXT:    fsw fs8, 124(a1)
-; LP64F-NEXT:    fsw fs9, 120(a1)
-; LP64F-NEXT:    fsw fs10, 116(a1)
-; LP64F-NEXT:    fsw fs11, 112(a1)
-; LP64F-NEXT:    fsw fs7, 108(a1)
-; LP64F-NEXT:    fsw fs6, 104(a1)
-; LP64F-NEXT:    fsw fs5, 100(a1)
-; LP64F-NEXT:    fsw fs4, 96(a1)
-; LP64F-NEXT:    fsw fs3, 92(a1)
-; LP64F-NEXT:    fsw fs2, 88(a1)
-; LP64F-NEXT:    fsw fs1, 84(a1)
-; LP64F-NEXT:    fsw fs0, 80(a1)
-; LP64F-NEXT:    fsw ft11, 76(a1)
-; LP64F-NEXT:    fsw ft10, 72(a1)
-; LP64F-NEXT:    fsw ft9, 68(a1)
-; LP64F-NEXT:    fsw ft8, 64(a1)
-; LP64F-NEXT:    fsw fa7, 60(a1)
-; LP64F-NEXT:    fsw fa6, 56(a1)
-; LP64F-NEXT:    fsw ft7, 52(a1)
-; LP64F-NEXT:    fsw ft6, 48(a1)
-; LP64F-NEXT:    fsw ft5, 44(a1)
-; LP64F-NEXT:    fsw ft4, 40(a1)
-; LP64F-NEXT:    fsw ft3, 36(a1)
-; LP64F-NEXT:    fsw ft2, 32(a1)
-; LP64F-NEXT:    fsw ft1, 28(a1)
-; LP64F-NEXT:    fsw ft0, 24(a1)
-; LP64F-NEXT:    fsw fa0, 20(a1)
-; LP64F-NEXT:    fsw fa1, 16(a1)
-; LP64F-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64F-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64F-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64F-NEXT:    fsw fs7, 120(a1)
+; LP64F-NEXT:    fsw fs6, 116(a1)
+; LP64F-NEXT:    fsw fs5, 112(a1)
+; LP64F-NEXT:    fsw fs4, 108(a1)
+; LP64F-NEXT:    fsw fs3, 104(a1)
+; LP64F-NEXT:    fsw fs2, 100(a1)
+; LP64F-NEXT:    fsw fs1, 96(a1)
+; LP64F-NEXT:    fsw fs0, 92(a1)
+; LP64F-NEXT:    fsw ft11, 88(a1)
+; LP64F-NEXT:    fsw ft10, 84(a1)
+; LP64F-NEXT:    fsw ft9, 80(a1)
+; LP64F-NEXT:    fsw ft8, 76(a1)
+; LP64F-NEXT:    fsw fa7, 72(a1)
+; LP64F-NEXT:    fsw fa6, 68(a1)
+; LP64F-NEXT:    fsw ft7, 64(a1)
+; LP64F-NEXT:    fsw ft6, 60(a1)
+; LP64F-NEXT:    fsw ft5, 56(a1)
+; LP64F-NEXT:    fsw ft4, 52(a1)
+; LP64F-NEXT:    fsw ft3, 48(a1)
+; LP64F-NEXT:    fsw ft2, 44(a1)
+; LP64F-NEXT:    fsw ft1, 40(a1)
+; LP64F-NEXT:    fsw ft0, 36(a1)
+; LP64F-NEXT:    fsw fa0, 32(a1)
+; LP64F-NEXT:    fsw fa1, 28(a1)
+; LP64F-NEXT:    fsw fa2, 24(a1)
+; LP64F-NEXT:    fsw fa3, 20(a1)
+; LP64F-NEXT:    fsw fa4, 16(a1)
+; LP64F-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64F-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64F-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64F-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64F-NEXT:    flw fs0, 44(sp) # 4-byte Folded Reload
 ; LP64F-NEXT:    flw fs1, 40(sp) # 4-byte Folded Reload
@@ -370,69 +370,69 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    lui a0, %hi(var)
 ; ILP32D-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32D-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32D-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32D-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32D-NEXT:    addi a1, a0, %lo(var)
-; ILP32D-NEXT:    flw fa1, 16(a1)
-; ILP32D-NEXT:    flw fa0, 20(a1)
-; ILP32D-NEXT:    flw ft0, 24(a1)
-; ILP32D-NEXT:    flw ft1, 28(a1)
-; ILP32D-NEXT:    flw ft2, 32(a1)
-; ILP32D-NEXT:    flw ft3, 36(a1)
-; ILP32D-NEXT:    flw ft4, 40(a1)
-; ILP32D-NEXT:    flw ft5, 44(a1)
-; ILP32D-NEXT:    flw ft6, 48(a1)
-; ILP32D-NEXT:    flw ft7, 52(a1)
-; ILP32D-NEXT:    flw fa6, 56(a1)
-; ILP32D-NEXT:    flw fa7, 60(a1)
-; ILP32D-NEXT:    flw ft8, 64(a1)
-; ILP32D-NEXT:    flw ft9, 68(a1)
-; ILP32D-NEXT:    flw ft10, 72(a1)
-; ILP32D-NEXT:    flw ft11, 76(a1)
-; ILP32D-NEXT:    flw fs0, 80(a1)
-; ILP32D-NEXT:    flw fs1, 84(a1)
-; ILP32D-NEXT:    flw fs2, 88(a1)
-; ILP32D-NEXT:    flw fs3, 92(a1)
-; ILP32D-NEXT:    flw fs4, 96(a1)
-; ILP32D-NEXT:    flw fs5, 100(a1)
-; ILP32D-NEXT:    flw fs6, 104(a1)
-; ILP32D-NEXT:    flw fs7, 108(a1)
+; ILP32D-NEXT:    flw fa4, 16(a1)
+; ILP32D-NEXT:    flw fa3, 20(a1)
+; ILP32D-NEXT:    flw fa2, 24(a1)
+; ILP32D-NEXT:    flw fa1, 28(a1)
+; ILP32D-NEXT:    flw fa0, 32(a1)
+; ILP32D-NEXT:    flw ft0, 36(a1)
+; ILP32D-NEXT:    flw ft1, 40(a1)
+; ILP32D-NEXT:    flw ft2, 44(a1)
+; ILP32D-NEXT:    flw ft3, 48(a1)
+; ILP32D-NEXT:    flw ft4, 52(a1)
+; ILP32D-NEXT:    flw ft5, 56(a1)
+; ILP32D-NEXT:    flw ft6, 60(a1)
+; ILP32D-NEXT:    flw ft7, 64(a1)
+; ILP32D-NEXT:    flw fa6, 68(a1)
+; ILP32D-NEXT:    flw fa7, 72(a1)
+; ILP32D-NEXT:    flw ft8, 76(a1)
+; ILP32D-NEXT:    flw ft9, 80(a1)
+; ILP32D-NEXT:    flw ft10, 84(a1)
+; ILP32D-NEXT:    flw ft11, 88(a1)
+; ILP32D-NEXT:    flw fs0, 92(a1)
+; ILP32D-NEXT:    flw fs1, 96(a1)
+; ILP32D-NEXT:    flw fs2, 100(a1)
+; ILP32D-NEXT:    flw fs3, 104(a1)
+; ILP32D-NEXT:    flw fs4, 108(a1)
+; ILP32D-NEXT:    flw fs5, 112(a1)
+; ILP32D-NEXT:    flw fs6, 116(a1)
+; ILP32D-NEXT:    flw fs7, 120(a1)
 ; ILP32D-NEXT:    flw fs8, 124(a1)
-; ILP32D-NEXT:    flw fs9, 120(a1)
-; ILP32D-NEXT:    flw fs10, 116(a1)
-; ILP32D-NEXT:    flw fs11, 112(a1)
+; ILP32D-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32D-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32D-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32D-NEXT:    fsw fs8, 124(a1)
-; ILP32D-NEXT:    fsw fs9, 120(a1)
-; ILP32D-NEXT:    fsw fs10, 116(a1)
-; ILP32D-NEXT:    fsw fs11, 112(a1)
-; ILP32D-NEXT:    fsw fs7, 108(a1)
-; ILP32D-NEXT:    fsw fs6, 104(a1)
-; ILP32D-NEXT:    fsw fs5, 100(a1)
-; ILP32D-NEXT:    fsw fs4, 96(a1)
-; ILP32D-NEXT:    fsw fs3, 92(a1)
-; ILP32D-NEXT:    fsw fs2, 88(a1)
-; ILP32D-NEXT:    fsw fs1, 84(a1)
-; ILP32D-NEXT:    fsw fs0, 80(a1)
-; ILP32D-NEXT:    fsw ft11, 76(a1)
-; ILP32D-NEXT:    fsw ft10, 72(a1)
-; ILP32D-NEXT:    fsw ft9, 68(a1)
-; ILP32D-NEXT:    fsw ft8, 64(a1)
-; ILP32D-NEXT:    fsw fa7, 60(a1)
-; ILP32D-NEXT:    fsw fa6, 56(a1)
-; ILP32D-NEXT:    fsw ft7, 52(a1)
-; ILP32D-NEXT:    fsw ft6, 48(a1)
-; ILP32D-NEXT:    fsw ft5, 44(a1)
-; ILP32D-NEXT:    fsw ft4, 40(a1)
-; ILP32D-NEXT:    fsw ft3, 36(a1)
-; ILP32D-NEXT:    fsw ft2, 32(a1)
-; ILP32D-NEXT:    fsw ft1, 28(a1)
-; ILP32D-NEXT:    fsw ft0, 24(a1)
-; ILP32D-NEXT:    fsw fa0, 20(a1)
-; ILP32D-NEXT:    fsw fa1, 16(a1)
-; ILP32D-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32D-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32D-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32D-NEXT:    fsw fs7, 120(a1)
+; ILP32D-NEXT:    fsw fs6, 116(a1)
+; ILP32D-NEXT:    fsw fs5, 112(a1)
+; ILP32D-NEXT:    fsw fs4, 108(a1)
+; ILP32D-NEXT:    fsw fs3, 104(a1)
+; ILP32D-NEXT:    fsw fs2, 100(a1)
+; ILP32D-NEXT:    fsw fs1, 96(a1)
+; ILP32D-NEXT:    fsw fs0, 92(a1)
+; ILP32D-NEXT:    fsw ft11, 88(a1)
+; ILP32D-NEXT:    fsw ft10, 84(a1)
+; ILP32D-NEXT:    fsw ft9, 80(a1)
+; ILP32D-NEXT:    fsw ft8, 76(a1)
+; ILP32D-NEXT:    fsw fa7, 72(a1)
+; ILP32D-NEXT:    fsw fa6, 68(a1)
+; ILP32D-NEXT:    fsw ft7, 64(a1)
+; ILP32D-NEXT:    fsw ft6, 60(a1)
+; ILP32D-NEXT:    fsw ft5, 56(a1)
+; ILP32D-NEXT:    fsw ft4, 52(a1)
+; ILP32D-NEXT:    fsw ft3, 48(a1)
+; ILP32D-NEXT:    fsw ft2, 44(a1)
+; ILP32D-NEXT:    fsw ft1, 40(a1)
+; ILP32D-NEXT:    fsw ft0, 36(a1)
+; ILP32D-NEXT:    fsw fa0, 32(a1)
+; ILP32D-NEXT:    fsw fa1, 28(a1)
+; ILP32D-NEXT:    fsw fa2, 24(a1)
+; ILP32D-NEXT:    fsw fa3, 20(a1)
+; ILP32D-NEXT:    fsw fa4, 16(a1)
+; ILP32D-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32D-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32D-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
@@ -466,69 +466,69 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    lui a0, %hi(var)
 ; LP64D-NEXT:    flw fa5, %lo(var)(a0)
-; LP64D-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64D-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64D-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64D-NEXT:    addi a1, a0, %lo(var)
-; LP64D-NEXT:    flw fa1, 16(a1)
-; LP64D-NEXT:    flw fa0, 20(a1)
-; LP64D-NEXT:    flw ft0, 24(a1)
-; LP64D-NEXT:    flw ft1, 28(a1)
-; LP64D-NEXT:    flw ft2, 32(a1)
-; LP64D-NEXT:    flw ft3, 36(a1)
-; LP64D-NEXT:    flw ft4, 40(a1)
-; LP64D-NEXT:    flw ft5, 44(a1)
-; LP64D-NEXT:    flw ft6, 48(a1)
-; LP64D-NEXT:    flw ft7, 52(a1)
-; LP64D-NEXT:    flw fa6, 56(a1)
-; LP64D-NEXT:    flw fa7, 60(a1)
-; LP64D-NEXT:    flw ft8, 64(a1)
-; LP64D-NEXT:    flw ft9, 68(a1)
-; LP64D-NEXT:    flw ft10, 72(a1)
-; LP64D-NEXT:    flw ft11, 76(a1)
-; LP64D-NEXT:    flw fs0, 80(a1)
-; LP64D-NEXT:    flw fs1, 84(a1)
-; LP64D-NEXT:    flw fs2, 88(a1)
-; LP64D-NEXT:    flw fs3, 92(a1)
-; LP64D-NEXT:    flw fs4, 96(a1)
-; LP64D-NEXT:    flw fs5, 100(a1)
-; LP64D-NEXT:    flw fs6, 104(a1)
-; LP64D-NEXT:    flw fs7, 108(a1)
+; LP64D-NEXT:    flw fa4, 16(a1)
+; LP64D-NEXT:    flw fa3, 20(a1)
+; LP64D-NEXT:    flw fa2, 24(a1)
+; LP64D-NEXT:    flw fa1, 28(a1)
+; LP64D-NEXT:    flw fa0, 32(a1)
+; LP64D-NEXT:    flw ft0, 36(a1)
+; LP64D-NEXT:    flw ft1, 40(a1)
+; LP64D-NEXT:    flw ft2, 44(a1)
+; LP64D-NEXT:    flw ft3, 48(a1)
+; LP64D-NEXT:    flw ft4, 52(a1)
+; LP64D-NEXT:    flw ft5, 56(a1)
+; LP64D-NEXT:    flw ft6, 60(a1)
+; LP64D-NEXT:    flw ft7, 64(a1)
+; LP64D-NEXT:    flw fa6, 68(a1)
+; LP64D-NEXT:    flw fa7, 72(a1)
+; LP64D-NEXT:    flw ft8, 76(a1)
+; LP64D-NEXT:    flw ft9, 80(a1)
+; LP64D-NEXT:    flw ft10, 84(a1)
+; LP64D-NEXT:    flw ft11, 88(a1)
+; LP64D-NEXT:    flw fs0, 92(a1)
+; LP64D-NEXT:    flw fs1, 96(a1)
+; LP64D-NEXT:    flw fs2, 100(a1)
+; LP64D-NEXT:    flw fs3, 104(a1)
+; LP64D-NEXT:    flw fs4, 108(a1)
+; LP64D-NEXT:    flw fs5, 112(a1)
+; LP64D-NEXT:    flw fs6, 116(a1)
+; LP64D-NEXT:    flw fs7, 120(a1)
 ; LP64D-NEXT:    flw fs8, 124(a1)
-; LP64D-NEXT:    flw fs9, 120(a1)
-; LP64D-NEXT:    flw fs10, 116(a1)
-; LP64D-NEXT:    flw fs11, 112(a1)
+; LP64D-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64D-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64D-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64D-NEXT:    fsw fs8, 124(a1)
-; LP64D-NEXT:    fsw fs9, 120(a1)
-; LP64D-NEXT:    fsw fs10, 116(a1)
-; LP64D-NEXT:    fsw fs11, 112(a1)
-; LP64D-NEXT:    fsw fs7, 108(a1)
-; LP64D-NEXT:    fsw fs6, 104(a1)
-; LP64D-NEXT:    fsw fs5, 100(a1)
-; LP64D-NEXT:    fsw fs4, 96(a1)
-; LP64D-NEXT:    fsw fs3, 92(a1)
-; LP64D-NEXT:    fsw fs2, 88(a1)
-; LP64D-NEXT:    fsw fs1, 84(a1)
-; LP64D-NEXT:    fsw fs0, 80(a1)
-; LP64D-NEXT:    fsw ft11, 76(a1)
-; LP64D-NEXT:    fsw ft10, 72(a1)
-; LP64D-NEXT:    fsw ft9, 68(a1)
-; LP64D-NEXT:    fsw ft8, 64(a1)
-; LP64D-NEXT:    fsw fa7, 60(a1)
-; LP64D-NEXT:    fsw fa6, 56(a1)
-; LP64D-NEXT:    fsw ft7, 52(a1)
-; LP64D-NEXT:    fsw ft6, 48(a1)
-; LP64D-NEXT:    fsw ft5, 44(a1)
-; LP64D-NEXT:    fsw ft4, 40(a1)
-; LP64D-NEXT:    fsw ft3, 36(a1)
-; LP64D-NEXT:    fsw ft2, 32(a1)
-; LP64D-NEXT:    fsw ft1, 28(a1)
-; LP64D-NEXT:    fsw ft0, 24(a1)
-; LP64D-NEXT:    fsw fa0, 20(a1)
-; LP64D-NEXT:    fsw fa1, 16(a1)
-; LP64D-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64D-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64D-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64D-NEXT:    fsw fs7, 120(a1)
+; LP64D-NEXT:    fsw fs6, 116(a1)
+; LP64D-NEXT:    fsw fs5, 112(a1)
+; LP64D-NEXT:    fsw fs4, 108(a1)
+; LP64D-NEXT:    fsw fs3, 104(a1)
+; LP64D-NEXT:    fsw fs2, 100(a1)
+; LP64D-NEXT:    fsw fs1, 96(a1)
+; LP64D-NEXT:    fsw fs0, 92(a1)
+; LP64D-NEXT:    fsw ft11, 88(a1)
+; LP64D-NEXT:    fsw ft10, 84(a1)
+; LP64D-NEXT:    fsw ft9, 80(a1)
+; LP64D-NEXT:    fsw ft8, 76(a1)
+; LP64D-NEXT:    fsw fa7, 72(a1)
+; LP64D-NEXT:    fsw fa6, 68(a1)
+; LP64D-NEXT:    fsw ft7, 64(a1)
+; LP64D-NEXT:    fsw ft6, 60(a1)
+; LP64D-NEXT:    fsw ft5, 56(a1)
+; LP64D-NEXT:    fsw ft4, 52(a1)
+; LP64D-NEXT:    fsw ft3, 48(a1)
+; LP64D-NEXT:    fsw ft2, 44(a1)
+; LP64D-NEXT:    fsw ft1, 40(a1)
+; LP64D-NEXT:    fsw ft0, 36(a1)
+; LP64D-NEXT:    fsw fa0, 32(a1)
+; LP64D-NEXT:    fsw fa1, 28(a1)
+; LP64D-NEXT:    fsw fa2, 24(a1)
+; LP64D-NEXT:    fsw fa3, 20(a1)
+; LP64D-NEXT:    fsw fa4, 16(a1)
+; LP64D-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64D-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64D-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64D-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
index 40076316bca892..a7f582f5f0699a 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
@@ -20,141 +20,141 @@ define void @callee() nounwind {
 ; ILP32-LABEL: callee:
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    lui a0, %hi(var)
-; ILP32-NEXT:    fld fa5, %lo(var)(a0)
-; ILP32-NEXT:    fld fa4, %lo(var+8)(a0)
 ; ILP32-NEXT:    addi a1, a0, %lo(var)
-; ILP32-NEXT:    fld fa3, 16(a1)
-; ILP32-NEXT:    fld fa2, 24(a1)
-; ILP32-NEXT:    fld fa1, 32(a1)
-; ILP32-NEXT:    fld fa0, 40(a1)
-; ILP32-NEXT:    fld ft0, 48(a1)
-; ILP32-NEXT:    fld ft1, 56(a1)
-; ILP32-NEXT:    fld ft2, 64(a1)
-; ILP32-NEXT:    fld ft3, 72(a1)
-; ILP32-NEXT:    fld ft4, 80(a1)
-; ILP32-NEXT:    fld ft5, 88(a1)
-; ILP32-NEXT:    fld ft6, 96(a1)
-; ILP32-NEXT:    fld ft7, 104(a1)
-; ILP32-NEXT:    fld fa6, 112(a1)
-; ILP32-NEXT:    fld fa7, 120(a1)
-; ILP32-NEXT:    fld ft8, 128(a1)
-; ILP32-NEXT:    fld ft9, 136(a1)
-; ILP32-NEXT:    fld ft10, 144(a1)
-; ILP32-NEXT:    fld ft11, 152(a1)
-; ILP32-NEXT:    fld fs0, 160(a1)
-; ILP32-NEXT:    fld fs1, 168(a1)
-; ILP32-NEXT:    fld fs2, 176(a1)
-; ILP32-NEXT:    fld fs3, 184(a1)
-; ILP32-NEXT:    fld fs4, 192(a1)
-; ILP32-NEXT:    fld fs5, 200(a1)
-; ILP32-NEXT:    fld fs6, 208(a1)
-; ILP32-NEXT:    fld fs7, 216(a1)
-; ILP32-NEXT:    fld fs8, 248(a1)
+; ILP32-NEXT:    fld fa5, 248(a1)
+; ILP32-NEXT:    fld fa4, 16(a1)
+; ILP32-NEXT:    fld fa3, 24(a1)
+; ILP32-NEXT:    fld fa2, 32(a1)
+; ILP32-NEXT:    fld fa1, 40(a1)
+; ILP32-NEXT:    fld fa0, 48(a1)
+; ILP32-NEXT:    fld ft0, 56(a1)
+; ILP32-NEXT:    fld ft1, 64(a1)
+; ILP32-NEXT:    fld ft2, 72(a1)
+; ILP32-NEXT:    fld ft3, 80(a1)
+; ILP32-NEXT:    fld ft4, 88(a1)
+; ILP32-NEXT:    fld ft5, 96(a1)
+; ILP32-NEXT:    fld ft6, 104(a1)
+; ILP32-NEXT:    fld ft7, 112(a1)
+; ILP32-NEXT:    fld fa6, 120(a1)
+; ILP32-NEXT:    fld fa7, 128(a1)
+; ILP32-NEXT:    fld ft8, 136(a1)
+; ILP32-NEXT:    fld ft9, 144(a1)
+; ILP32-NEXT:    fld ft10, 152(a1)
+; ILP32-NEXT:    fld ft11, 160(a1)
+; ILP32-NEXT:    fld fs0, 168(a1)
+; ILP32-NEXT:    fld fs1, 176(a1)
+; ILP32-NEXT:    fld fs2, 184(a1)
+; ILP32-NEXT:    fld fs3, 192(a1)
+; ILP32-NEXT:    fld fs4, 200(a1)
+; ILP32-NEXT:    fld fs5, 208(a1)
+; ILP32-NEXT:    fld fs6, 216(a1)
+; ILP32-NEXT:    fld fs7, 224(a1)
+; ILP32-NEXT:    fld fs8, 232(a1)
 ; ILP32-NEXT:    fld fs9, 240(a1)
-; ILP32-NEXT:    fld fs10, 232(a1)
-; ILP32-NEXT:    fld fs11, 224(a1)
-; ILP32-NEXT:    fsd fs8, 248(a1)
+; ILP32-NEXT:    fld fs10, %lo(var)(a0)
+; ILP32-NEXT:    fld fs11, %lo(var+8)(a0)
+; ILP32-NEXT:    fsd fa5, 248(a1)
 ; ILP32-NEXT:    fsd fs9, 240(a1)
-; ILP32-NEXT:    fsd fs10, 232(a1)
-; ILP32-NEXT:    fsd fs11, 224(a1)
-; ILP32-NEXT:    fsd fs7, 216(a1)
-; ILP32-NEXT:    fsd fs6, 208(a1)
-; ILP32-NEXT:    fsd fs5, 200(a1)
-; ILP32-NEXT:    fsd fs4, 192(a1)
-; ILP32-NEXT:    fsd fs3, 184(a1)
-; ILP32-NEXT:    fsd fs2, 176(a1)
-; ILP32-NEXT:    fsd fs1, 168(a1)
-; ILP32-NEXT:    fsd fs0, 160(a1)
-; ILP32-NEXT:    fsd ft11, 152(a1)
-; ILP32-NEXT:    fsd ft10, 144(a1)
-; ILP32-NEXT:    fsd ft9, 136(a1)
-; ILP32-NEXT:    fsd ft8, 128(a1)
-; ILP32-NEXT:    fsd fa7, 120(a1)
-; ILP32-NEXT:    fsd fa6, 112(a1)
-; ILP32-NEXT:    fsd ft7, 104(a1)
-; ILP32-NEXT:    fsd ft6, 96(a1)
-; ILP32-NEXT:    fsd ft5, 88(a1)
-; ILP32-NEXT:    fsd ft4, 80(a1)
-; ILP32-NEXT:    fsd ft3, 72(a1)
-; ILP32-NEXT:    fsd ft2, 64(a1)
-; ILP32-NEXT:    fsd ft1, 56(a1)
-; ILP32-NEXT:    fsd ft0, 48(a1)
-; ILP32-NEXT:    fsd fa0, 40(a1)
-; ILP32-NEXT:    fsd fa1, 32(a1)
-; ILP32-NEXT:    fsd fa2, 24(a1)
-; ILP32-NEXT:    fsd fa3, 16(a1)
-; ILP32-NEXT:    fsd fa4, %lo(var+8)(a0)
-; ILP32-NEXT:    fsd fa5, %lo(var)(a0)
+; ILP32-NEXT:    fsd fs8, 232(a1)
+; ILP32-NEXT:    fsd fs7, 224(a1)
+; ILP32-NEXT:    fsd fs6, 216(a1)
+; ILP32-NEXT:    fsd fs5, 208(a1)
+; ILP32-NEXT:    fsd fs4, 200(a1)
+; ILP32-NEXT:    fsd fs3, 192(a1)
+; ILP32-NEXT:    fsd fs2, 184(a1)
+; ILP32-NEXT:    fsd fs1, 176(a1)
+; ILP32-NEXT:    fsd fs0, 168(a1)
+; ILP32-NEXT:    fsd ft11, 160(a1)
+; ILP32-NEXT:    fsd ft10, 152(a1)
+; ILP32-NEXT:    fsd ft9, 144(a1)
+; ILP32-NEXT:    fsd ft8, 136(a1)
+; ILP32-NEXT:    fsd fa7, 128(a1)
+; ILP32-NEXT:    fsd fa6, 120(a1)
+; ILP32-NEXT:    fsd ft7, 112(a1)
+; ILP32-NEXT:    fsd ft6, 104(a1)
+; ILP32-NEXT:    fsd ft5, 96(a1)
+; ILP32-NEXT:    fsd ft4, 88(a1)
+; ILP32-NEXT:    fsd ft3, 80(a1)
+; ILP32-NEXT:    fsd ft2, 72(a1)
+; ILP32-NEXT:    fsd ft1, 64(a1)
+; ILP32-NEXT:    fsd ft0, 56(a1)
+; ILP32-NEXT:    fsd fa0, 48(a1)
+; ILP32-NEXT:    fsd fa1, 40(a1)
+; ILP32-NEXT:    fsd fa2, 32(a1)
+; ILP32-NEXT:    fsd fa3, 24(a1)
+; ILP32-NEXT:    fsd fa4, 16(a1)
+; ILP32-NEXT:    fsd fs11, %lo(var+8)(a0)
+; ILP32-NEXT:    fsd fs10, %lo(var)(a0)
 ; ILP32-NEXT:    ret
 ;
 ; LP64-LABEL: callee:
 ; LP64:       # %bb.0:
 ; LP64-NEXT:    lui a0, %hi(var)
-; LP64-NEXT:    fld fa5, %lo(var)(a0)
-; LP64-NEXT:    fld fa4, %lo(var+8)(a0)
 ; LP64-NEXT:    addi a1, a0, %lo(var)
-; LP64-NEXT:    fld fa3, 16(a1)
-; LP64-NEXT:    fld fa2, 24(a1)
-; LP64-NEXT:    fld fa1, 32(a1)
-; LP64-NEXT:    fld fa0, 40(a1)
-; LP64-NEXT:    fld ft0, 48(a1)
-; LP64-NEXT:    fld ft1, 56(a1)
-; LP64-NEXT:    fld ft2, 64(a1)
-; LP64-NEXT:    fld ft3, 72(a1)
-; LP64-NEXT:    fld ft4, 80(a1)
-; LP64-NEXT:    fld ft5, 88(a1)
-; LP64-NEXT:    fld ft6, 96(a1)
-; LP64-NEXT:    fld ft7, 104(a1)
-; LP64-NEXT:    fld fa6, 112(a1)
-; LP64-NEXT:    fld fa7, 120(a1)
-; LP64-NEXT:    fld ft8, 128(a1)
-; LP64-NEXT:    fld ft9, 136(a1)
-; LP64-NEXT:    fld ft10, 144(a1)
-; LP64-NEXT:    fld ft11, 152(a1)
-; LP64-NEXT:    fld fs0, 160(a1)
-; LP64-NEXT:    fld fs1, 168(a1)
-; LP64-NEXT:    fld fs2, 176(a1)
-; LP64-NEXT:    fld fs3, 184(a1)
-; LP64-NEXT:    fld fs4, 192(a1)
-; LP64-NEXT:    fld fs5, 200(a1)
-; LP64-NEXT:    fld fs6, 208(a1)
-; LP64-NEXT:    fld fs7, 216(a1)
-; LP64-NEXT:    fld fs8, 248(a1)
+; LP64-NEXT:    fld fa5, 248(a1)
+; LP64-NEXT:    fld fa4, 16(a1)
+; LP64-NEXT:    fld fa3, 24(a1)
+; LP64-NEXT:    fld fa2, 32(a1)
+; LP64-NEXT:    fld fa1, 40(a1)
+; LP64-NEXT:    fld fa0, 48(a1)
+; LP64-NEXT:    fld ft0, 56(a1)
+; LP64-NEXT:    fld ft1, 64(a1)
+; LP64-NEXT:    fld ft2, 72(a1)
+; LP64-NEXT:    fld ft3, 80(a1)
+; LP64-NEXT:    fld ft4, 88(a1)
+; LP64-NEXT:    fld ft5, 96(a1)
+; LP64-NEXT:    fld ft6, 104(a1)
+; LP64-NEXT:    fld ft7, 112(a1)
+; LP64-NEXT:    fld fa6, 120(a1)
+; LP64-NEXT:    fld fa7, 128(a1)
+; LP64-NEXT:    fld ft8, 136(a1)
+; LP64-NEXT:    fld ft9, 144(a1)
+; LP64-NEXT:    fld ft10, 152(a1)
+; LP64-NEXT:    fld ft11, 160(a1)
+; LP64-NEXT:    fld fs0, 168(a1)
+; LP64-NEXT:    fld fs1, 176(a1)
+; LP64-NEXT:    fld fs2, 184(a1)
+; LP64-NEXT:    fld fs3, 192(a1)
+; LP64-NEXT:    fld fs4, 200(a1)
+; LP64-NEXT:    fld fs5, 208(a1)
+; LP64-NEXT:    fld fs6, 216(a1)
+; LP64-NEXT:    fld fs7, 224(a1)
+; LP64-NEXT:    fld fs8, 232(a1)
 ; LP64-NEXT:    fld fs9, 240(a1)
-; LP64-NEXT:    fld fs10, 232(a1)
-; LP64-NEXT:    fld fs11, 224(a1)
-; LP64-NEXT:    fsd fs8, 248(a1)
+; LP64-NEXT:    fld fs10, %lo(var)(a0)
+; LP64-NEXT:    fld fs11, %lo(var+8)(a0)
+; LP64-NEXT:    fsd fa5, 248(a1)
 ; LP64-NEXT:    fsd fs9, 240(a1)
-; LP64-NEXT:    fsd fs10, 232(a1)
-; LP64-NEXT:    fsd fs11, 224(a1)
-; LP64-NEXT:    fsd fs7, 216(a1)
-; LP64-NEXT:    fsd fs6, 208(a1)
-; LP64-NEXT:    fsd fs5, 200(a1)
-; LP64-NEXT:    fsd fs4, 192(a1)
-; LP64-NEXT:    fsd fs3, 184(a1)
-; LP64-NEXT:    fsd fs2, 176(a1)
-; LP64-NEXT:    fsd fs1, 168(a1)
-; LP64-NEXT:    fsd fs0, 160(a1)
-; LP64-NEXT:    fsd ft11, 152(a1)
-; LP64-NEXT:    fsd ft10, 144(a1)
-; LP64-NEXT:    fsd ft9, 136(a1)
-; LP64-NEXT:    fsd ft8, 128(a1)
-; LP64-NEXT:    fsd fa7, 120(a1)
-; LP64-NEXT:    fsd fa6, 112(a1)
-; LP64-NEXT:    fsd ft7, 104(a1)
-; LP64-NEXT:    fsd ft6, 96(a1)
-; LP64-NEXT:    fsd ft5, 88(a1)
-; LP64-NEXT:    fsd ft4, 80(a1)
-; LP64-NEXT:    fsd ft3, 72(a1)
-; LP64-NEXT:    fsd ft2, 64(a1)
-; LP64-NEXT:    fsd ft1, 56(a1)
-; LP64-NEXT:    fsd ft0, 48(a1)
-; LP64-NEXT:    fsd fa0, 40(a1)
-; LP64-NEXT:    fsd fa1, 32(a1)
-; LP64-NEXT:    fsd fa2, 24(a1)
-; LP64-NEXT:    fsd fa3, 16(a1)
-; LP64-NEXT:    fsd fa4, %lo(var+8)(a0)
-; LP64-NEXT:    fsd fa5, %lo(var)(a0)
+; LP64-NEXT:    fsd fs8, 232(a1)
+; LP64-NEXT:    fsd fs7, 224(a1)
+; LP64-NEXT:    fsd fs6, 216(a1)
+; LP64-NEXT:    fsd fs5, 208(a1)
+; LP64-NEXT:    fsd fs4, 200(a1)
+; LP64-NEXT:    fsd fs3, 192(a1)
+; LP64-NEXT:    fsd fs2, 184(a1)
+; LP64-NEXT:    fsd fs1, 176(a1)
+; LP64-NEXT:    fsd fs0, 168(a1)
+; LP64-NEXT:    fsd ft11, 160(a1)
+; LP64-NEXT:    fsd ft10, 152(a1)
+; LP64-NEXT:    fsd ft9, 144(a1)
+; LP64-NEXT:    fsd ft8, 136(a1)
+; LP64-NEXT:    fsd fa7, 128(a1)
+; LP64-NEXT:    fsd fa6, 120(a1)
+; LP64-NEXT:    fsd ft7, 112(a1)
+; LP64-NEXT:    fsd ft6, 104(a1)
+; LP64-NEXT:    fsd ft5, 96(a1)
+; LP64-NEXT:    fsd ft4, 88(a1)
+; LP64-NEXT:    fsd ft3, 80(a1)
+; LP64-NEXT:    fsd ft2, 72(a1)
+; LP64-NEXT:    fsd ft1, 64(a1)
+; LP64-NEXT:    fsd ft0, 56(a1)
+; LP64-NEXT:    fsd fa0, 48(a1)
+; LP64-NEXT:    fsd fa1, 40(a1)
+; LP64-NEXT:    fsd fa2, 32(a1)
+; LP64-NEXT:    fsd fa3, 24(a1)
+; LP64-NEXT:    fsd fa4, 16(a1)
+; LP64-NEXT:    fsd fs11, %lo(var+8)(a0)
+; LP64-NEXT:    fsd fs10, %lo(var)(a0)
 ; LP64-NEXT:    ret
 ;
 ; ILP32D-LABEL: callee:
@@ -173,71 +173,71 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    fsd fs10, 8(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    lui a0, %hi(var)
-; ILP32D-NEXT:    fld fa5, %lo(var)(a0)
-; ILP32D-NEXT:    fld fa4, %lo(var+8)(a0)
 ; ILP32D-NEXT:    addi a1, a0, %lo(var)
-; ILP32D-NEXT:    fld fa3, 16(a1)
-; ILP32D-NEXT:    fld fa2, 24(a1)
-; ILP32D-NEXT:    fld fa1, 32(a1)
-; ILP32D-NEXT:    fld fa0, 40(a1)
-; ILP32D-NEXT:    fld ft0, 48(a1)
-; ILP32D-NEXT:    fld ft1, 56(a1)
-; ILP32D-NEXT:    fld ft2, 64(a1)
-; ILP32D-NEXT:    fld ft3, 72(a1)
-; ILP32D-NEXT:    fld ft4, 80(a1)
-; ILP32D-NEXT:    fld ft5, 88(a1)
-; ILP32D-NEXT:    fld ft6, 96(a1)
-; ILP32D-NEXT:    fld ft7, 104(a1)
-; ILP32D-NEXT:    fld fa6, 112(a1)
-; ILP32D-NEXT:    fld fa7, 120(a1)
-; ILP32D-NEXT:    fld ft8, 128(a1)
-; ILP32D-NEXT:    fld ft9, 136(a1)
-; ILP32D-NEXT:    fld ft10, 144(a1)
-; ILP32D-NEXT:    fld ft11, 152(a1)
-; ILP32D-NEXT:    fld fs0, 160(a1)
-; ILP32D-NEXT:    fld fs1, 168(a1)
-; ILP32D-NEXT:    fld fs2, 176(a1)
-; ILP32D-NEXT:    fld fs3, 184(a1)
-; ILP32D-NEXT:    fld fs4, 192(a1)
-; ILP32D-NEXT:    fld fs5, 200(a1)
-; ILP32D-NEXT:    fld fs6, 208(a1)
-; ILP32D-NEXT:    fld fs7, 216(a1)
-; ILP32D-NEXT:    fld fs8, 248(a1)
+; ILP32D-NEXT:    fld fa5, 248(a1)
+; ILP32D-NEXT:    fld fa4, 16(a1)
+; ILP32D-NEXT:    fld fa3, 24(a1)
+; ILP32D-NEXT:    fld fa2, 32(a1)
+; ILP32D-NEXT:    fld fa1, 40(a1)
+; ILP32D-NEXT:    fld fa0, 48(a1)
+; ILP32D-NEXT:    fld ft0, 56(a1)
+; ILP32D-NEXT:    fld ft1, 64(a1)
+; ILP32D-NEXT:    fld ft2, 72(a1)
+; ILP32D-NEXT:    fld ft3, 80(a1)
+; ILP32D-NEXT:    fld ft4, 88(a1)
+; ILP32D-NEXT:    fld ft5, 96(a1)
+; ILP32D-NEXT:    fld ft6, 104(a1)
+; ILP32D-NEXT:    fld ft7, 112(a1)
+; ILP32D-NEXT:    fld fa6, 120(a1)
+; ILP32D-NEXT:    fld fa7, 128(a1)
+; ILP32D-NEXT:    fld ft8, 136(a1)
+; ILP32D-NEXT:    fld ft9, 144(a1)
+; ILP32D-NEXT:    fld ft10, 152(a1)
+; ILP32D-NEXT:    fld ft11, 160(a1)
+; ILP32D-NEXT:    fld fs0, 168(a1)
+; ILP32D-NEXT:    fld fs1, 176(a1)
+; ILP32D-NEXT:    fld fs2, 184(a1)
+; ILP32D-NEXT:    fld fs3, 192(a1)
+; ILP32D-NEXT:    fld fs4, 200(a1)
+; ILP32D-NEXT:    fld fs5, 208(a1)
+; ILP32D-NEXT:    fld fs6, 216(a1)
+; ILP32D-NEXT:    fld fs7, 224(a1)
+; ILP32D-NEXT:    fld fs8, 232(a1)
 ; ILP32D-NEXT:    fld fs9, 240(a1)
-; ILP32D-NEXT:    fld fs10, 232(a1)
-; ILP32D-NEXT:    fld fs11, 224(a1)
-; ILP32D-NEXT:    fsd fs8, 248(a1)
+; ILP32D-NEXT:    fld fs10, %lo(var)(a0)
+; ILP32D-NEXT:    fld fs11, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsd fa5, 248(a1)
 ; ILP32D-NEXT:    fsd fs9, 240(a1)
-; ILP32D-NEXT:    fsd fs10, 232(a1)
-; ILP32D-NEXT:    fsd fs11, 224(a1)
-; ILP32D-NEXT:    fsd fs7, 216(a1)
-; ILP32D-NEXT:    fsd fs6, 208(a1)
-; ILP32D-NEXT:    fsd fs5, 200(a1)
-; ILP32D-NEXT:    fsd fs4, 192(a1)
-; ILP32D-NEXT:    fsd fs3, 184(a1)
-; ILP32D-NEXT:    fsd fs2, 176(a1)
-; ILP32D-NEXT:    fsd fs1, 168(a1)
-; ILP32D-NEXT:    fsd fs0, 160(a1)
-; ILP32D-NEXT:    fsd ft11, 152(a1)
-; ILP32D-NEXT:    fsd ft10, 144(a1)
-; ILP32D-NEXT:    fsd ft9, 136(a1)
-; ILP32D-NEXT:    fsd ft8, 128(a1)
-; ILP32D-NEXT:    fsd fa7, 120(a1)
-; ILP32D-NEXT:    fsd fa6, 112(a1)
-; ILP32D-NEXT:    fsd ft7, 104(a1)
-; ILP32D-NEXT:    fsd ft6, 96(a1)
-; ILP32D-NEXT:    fsd ft5, 88(a1)
-; ILP32D-NEXT:    fsd ft4, 80(a1)
-; ILP32D-NEXT:    fsd ft3, 72(a1)
-; ILP32D-NEXT:    fsd ft2, 64(a1)
-; ILP32D-NEXT:    fsd ft1, 56(a1)
-; ILP32D-NEXT:    fsd ft0, 48(a1)
-; ILP32D-NEXT:    fsd fa0, 40(a1)
-; ILP32D-NEXT:    fsd fa1, 32(a1)
-; ILP32D-NEXT:    fsd fa2, 24(a1)
-; ILP32D-NEXT:    fsd fa3, 16(a1)
-; ILP32D-NEXT:    fsd fa4, %lo(var+8)(a0)
-; ILP32D-NEXT:    fsd fa5, %lo(var)(a0)
+; ILP32D-NEXT:    fsd fs8, 232(a1)
+; ILP32D-NEXT:    fsd fs7, 224(a1)
+; ILP32D-NEXT:    fsd fs6, 216(a1)
+; ILP32D-NEXT:    fsd fs5, 208(a1)
+; ILP32D-NEXT:    fsd fs4, 200(a1)
+; ILP32D-NEXT:    fsd fs3, 192(a1)
+; ILP32D-NEXT:    fsd fs2, 184(a1)
+; ILP32D-NEXT:    fsd fs1, 176(a1)
+; ILP32D-NEXT:    fsd fs0, 168(a1)
+; ILP32D-NEXT:    fsd ft11, 160(a1)
+; ILP32D-NEXT:    fsd ft10, 152(a1)
+; ILP32D-NEXT:    fsd ft9, 144(a1)
+; ILP32D-NEXT:    fsd ft8, 136(a1)
+; ILP32D-NEXT:    fsd fa7, 128(a1)
+; ILP32D-NEXT:    fsd fa6, 120(a1)
+; ILP32D-NEXT:    fsd ft7, 112(a1)
+; ILP32D-NEXT:    fsd ft6, 104(a1)
+; ILP32D-NEXT:    fsd ft5, 96(a1)
+; ILP32D-NEXT:    fsd ft4, 88(a1)
+; ILP32D-NEXT:    fsd ft3, 80(a1)
+; ILP32D-NEXT:    fsd ft2, 72(a1)
+; ILP32D-NEXT:    fsd ft1, 64(a1)
+; ILP32D-NEXT:    fsd ft0, 56(a1)
+; ILP32D-NEXT:    fsd fa0, 48(a1)
+; ILP32D-NEXT:    fsd fa1, 40(a1)
+; ILP32D-NEXT:    fsd fa2, 32(a1)
+; ILP32D-NEXT:    fsd fa3, 24(a1)
+; ILP32D-NEXT:    fsd fa4, 16(a1)
+; ILP32D-NEXT:    fsd fs11, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsd fs10, %lo(var)(a0)
 ; ILP32D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs2, 72(sp) # 8-byte Folded Reload
@@ -269,71 +269,71 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    fsd fs10, 8(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    lui a0, %hi(var)
-; LP64D-NEXT:    fld fa5, %lo(var)(a0)
-; LP64D-NEXT:    fld fa4, %lo(var+8)(a0)
 ; LP64D-NEXT:    addi a1, a0, %lo(var)
-; LP64D-NEXT:    fld fa3, 16(a1)
-; LP64D-NEXT:    fld fa2, 24(a1)
-; LP64D-NEXT:    fld fa1, 32(a1)
-; LP64D-NEXT:    fld fa0, 40(a1)
-; LP64D-NEXT:    fld ft0, 48(a1)
-; LP64D-NEXT:    fld ft1, 56(a1)
-; LP64D-NEXT:    fld ft2, 64(a1)
-; LP64D-NEXT:    fld ft3, 72(a1)
-; LP64D-NEXT:    fld ft4, 80(a1)
-; LP64D-NEXT:    fld ft5, 88(a1)
-; LP64D-NEXT:    fld ft6, 96(a1)
-; LP64D-NEXT:    fld ft7, 104(a1)
-; LP64D-NEXT:    fld fa6, 112(a1)
-; LP64D-NEXT:    fld fa7, 120(a1)
-; LP64D-NEXT:    fld ft8, 128(a1)
-; LP64D-NEXT:    fld ft9, 136(a1)
-; LP64D-NEXT:    fld ft10, 144(a1)
-; LP64D-NEXT:    fld ft11, 152(a1)
-; LP64D-NEXT:    fld fs0, 160(a1)
-; LP64D-NEXT:    fld fs1, 168(a1)
-; LP64D-NEXT:    fld fs2, 176(a1)
-; LP64D-NEXT:    fld fs3, 184(a1)
-; LP64D-NEXT:    fld fs4, 192(a1)
-; LP64D-NEXT:    fld fs5, 200(a1)
-; LP64D-NEXT:    fld fs6, 208(a1)
-; LP64D-NEXT:    fld fs7, 216(a1)
-; LP64D-NEXT:    fld fs8, 248(a1)
+; LP64D-NEXT:    fld fa5, 248(a1)
+; LP64D-NEXT:    fld fa4, 16(a1)
+; LP64D-NEXT:    fld fa3, 24(a1)
+; LP64D-NEXT:    fld fa2, 32(a1)
+; LP64D-NEXT:    fld fa1, 40(a1)
+; LP64D-NEXT:    fld fa0, 48(a1)
+; LP64D-NEXT:    fld ft0, 56(a1)
+; LP64D-NEXT:    fld ft1, 64(a1)
+; LP64D-NEXT:    fld ft2, 72(a1)
+; LP64D-NEXT:    fld ft3, 80(a1)
+; LP64D-NEXT:    fld ft4, 88(a1)
+; LP64D-NEXT:    fld ft5, 96(a1)
+; LP64D-NEXT:    fld ft6, 104(a1)
+; LP64D-NEXT:    fld ft7, 112(a1)
+; LP64D-NEXT:    fld fa6, 120(a1)
+; LP64D-NEXT:    fld fa7, 128(a1)
+; LP64D-NEXT:    fld ft8, 136(a1)
+; LP64D-NEXT:    fld ft9, 144(a1)
+; LP64D-NEXT:    fld ft10, 152(a1)
+; LP64D-NEXT:    fld ft11, 160(a1)
+; LP64D-NEXT:    fld fs0, 168(a1)
+; LP64D-NEXT:    fld fs1, 176(a1)
+; LP64D-NEXT:    fld fs2, 184(a1)
+; LP64D-NEXT:    fld fs3, 192(a1)
+; LP64D-NEXT:    fld fs4, 200(a1)
+; LP64D-NEXT:    fld fs5, 208(a1)
+; LP64D-NEXT:    fld fs6, 216(a1)
+; LP64D-NEXT:    fld fs7, 224(a1)
+; LP64D-NEXT:    fld fs8, 232(a1)
 ; LP64D-NEXT:    fld fs9, 240(a1)
-; LP64D-NEXT:    fld fs10, 232(a1)
-; LP64D-NEXT:    fld fs11, 224(a1)
-; LP64D-NEXT:    fsd fs8, 248(a1)
+; LP64D-NEXT:    fld fs10, %lo(var)(a0)
+; LP64D-NEXT:    fld fs11, %lo(var+8)(a0)
+; LP64D-NEXT:    fsd fa5, 248(a1)
 ; LP64D-NEXT:    fsd fs9, 240(a1)
-; LP64D-NEXT:    fsd fs10, 232(a1)
-; LP64D-NEXT:    fsd fs11, 224(a1)
-; LP64D-NEXT:    fsd fs7, 216(a1)
-; LP64D-NEXT:    fsd fs6, 208(a1)
-; LP64D-NEXT:    fsd fs5, 200(a1)
-; LP64D-NEXT:    fsd fs4, 192(a1)
-; LP64D-NEXT:    fsd fs3, 184(a1)
-; LP64D-NEXT:    fsd fs2, 176(a1)
-; LP64D-NEXT:    fsd fs1, 168(a1)
-; LP64D-NEXT:    fsd fs0, 160(a1)
-; LP64D-NEXT:    fsd ft11, 152(a1)
-; LP64D-NEXT:    fsd ft10, 144(a1)
-; LP64D-NEXT:    fsd ft9, 136(a1)
-; LP64D-NEXT:    fsd ft8, 128(a1)
-; LP64D-NEXT:    fsd fa7, 120(a1)
-; LP64D-NEXT:    fsd fa6, 112(a1)
-; LP64D-NEXT:    fsd ft7, 104(a1)
-; LP64D-NEXT:    fsd ft6, 96(a1)
-; LP64D-NEXT:    fsd ft5, 88(a1)
-; LP64D-NEXT:    fsd ft4, 80(a1)
-; LP64D-NEXT:    fsd ft3, 72(a1)
-; LP64D-NEXT:    fsd ft2, 64(a1)
-; LP64D-NEXT:    fsd ft1, 56(a1)
-; LP64D-NEXT:    fsd ft0, 48(a1)
-; LP64D-NEXT:    fsd fa0, 40(a1)
-; LP64D-NEXT:    fsd fa1, 32(a1)
-; LP64D-NEXT:    fsd fa2, 24(a1)
-; LP64D-NEXT:    fsd fa3, 16(a1)
-; LP64D-NEXT:    fsd fa4, %lo(var+8)(a0)
-; LP64D-NEXT:    fsd fa5, %lo(var)(a0)
+; LP64D-NEXT:    fsd fs8, 232(a1)
+; LP64D-NEXT:    fsd fs7, 224(a1)
+; LP64D-NEXT:    fsd fs6, 216(a1)
+; LP64D-NEXT:    fsd fs5, 208(a1)
+; LP64D-NEXT:    fsd fs4, 200(a1)
+; LP64D-NEXT:    fsd fs3, 192(a1)
+; LP64D-NEXT:    fsd fs2, 184(a1)
+; LP64D-NEXT:    fsd fs1, 176(a1)
+; LP64D-NEXT:    fsd fs0, 168(a1)
+; LP64D-NEXT:    fsd ft11, 160(a1)
+; LP64D-NEXT:    fsd ft10, 152(a1)
+; LP64D-NEXT:    fsd ft9, 144(a1)
+; LP64D-NEXT:    fsd ft8, 136(a1)
+; LP64D-NEXT:    fsd fa7, 128(a1)
+; LP64D-NEXT:    fsd fa6, 120(a1)
+; LP64D-NEXT:    fsd ft7, 112(a1)
+; LP64D-NEXT:    fsd ft6, 104(a1)
+; LP64D-NEXT:    fsd ft5, 96(a1)
+; LP64D-NEXT:    fsd ft4, 88(a1)
+; LP64D-NEXT:    fsd ft3, 80(a1)
+; LP64D-NEXT:    fsd ft2, 72(a1)
+; LP64D-NEXT:    fsd ft1, 64(a1)
+; LP64D-NEXT:    fsd ft0, 56(a1)
+; LP64D-NEXT:    fsd fa0, 48(a1)
+; LP64D-NEXT:    fsd fa1, 40(a1)
+; LP64D-NEXT:    fsd fa2, 32(a1)
+; LP64D-NEXT:    fsd fa3, 24(a1)
+; LP64D-NEXT:    fsd fa4, 16(a1)
+; LP64D-NEXT:    fsd fs11, %lo(var+8)(a0)
+; LP64D-NEXT:    fsd fs10, %lo(var)(a0)
 ; LP64D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs2, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
index 09ecbbc7e8feb8..a8ca5d02ff78de 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
@@ -50,84 +50,84 @@ define void @callee() nounwind {
 ; RV32I-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var)
-; RV32I-NEXT:    lw a0, %lo(var)(a6)
+; RV32I-NEXT:    lui a4, %hi(var)
+; RV32I-NEXT:    lw a0, %lo(var)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var)(a6)
+; RV32I-NEXT:    sw a0, %lo(var)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -161,86 +161,86 @@ define void @callee() nounwind {
 ; RV32I-WITH-FP-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-WITH-FP-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
 ; RV32I-WITH-FP-NEXT:    addi s0, sp, 80
-; RV32I-WITH-FP-NEXT:    lui a6, %hi(var)
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
+; RV32I-WITH-FP-NEXT:    lui a5, %hi(var)
+; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(a5)
 ; RV32I-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32I-WITH-FP-NEXT:    addi a2, a5, %lo(var)
+; RV32I-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32I-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32I-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV32I-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -76(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -80(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw t1, 28(a5)
-; RV32I-WITH-FP-NEXT:    lw t2, 32(a5)
-; RV32I-WITH-FP-NEXT:    lw t3, 36(a5)
-; RV32I-WITH-FP-NEXT:    lw t4, 40(a5)
-; RV32I-WITH-FP-NEXT:    lw t5, 44(a5)
-; RV32I-WITH-FP-NEXT:    lw t6, 48(a5)
-; RV32I-WITH-FP-NEXT:    lw s1, 52(a5)
-; RV32I-WITH-FP-NEXT:    lw s2, 56(a5)
-; RV32I-WITH-FP-NEXT:    lw s3, 60(a5)
-; RV32I-WITH-FP-NEXT:    lw s4, 64(a5)
-; RV32I-WITH-FP-NEXT:    lw s5, 68(a5)
-; RV32I-WITH-FP-NEXT:    lw s6, 72(a5)
-; RV32I-WITH-FP-NEXT:    lw s7, 76(a5)
-; RV32I-WITH-FP-NEXT:    lw s8, 80(a5)
-; RV32I-WITH-FP-NEXT:    lw s9, 84(a5)
-; RV32I-WITH-FP-NEXT:    lw s10, 88(a5)
-; RV32I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV32I-WITH-FP-NEXT:    lw ra, 96(a5)
-; RV32I-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV32I-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV32I-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV32I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV32I-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV32I-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV32I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV32I-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV32I-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV32I-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV32I-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV32I-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV32I-WITH-FP-NEXT:    sw ra, 96(a5)
-; RV32I-WITH-FP-NEXT:    sw s11, 92(a5)
-; RV32I-WITH-FP-NEXT:    sw s10, 88(a5)
-; RV32I-WITH-FP-NEXT:    sw s9, 84(a5)
-; RV32I-WITH-FP-NEXT:    sw s8, 80(a5)
-; RV32I-WITH-FP-NEXT:    sw s7, 76(a5)
-; RV32I-WITH-FP-NEXT:    sw s6, 72(a5)
-; RV32I-WITH-FP-NEXT:    sw s5, 68(a5)
-; RV32I-WITH-FP-NEXT:    sw s4, 64(a5)
-; RV32I-WITH-FP-NEXT:    sw s3, 60(a5)
-; RV32I-WITH-FP-NEXT:    sw s2, 56(a5)
-; RV32I-WITH-FP-NEXT:    sw s1, 52(a5)
-; RV32I-WITH-FP-NEXT:    sw t6, 48(a5)
-; RV32I-WITH-FP-NEXT:    sw t5, 44(a5)
-; RV32I-WITH-FP-NEXT:    sw t4, 40(a5)
-; RV32I-WITH-FP-NEXT:    sw t3, 36(a5)
-; RV32I-WITH-FP-NEXT:    sw t2, 32(a5)
-; RV32I-WITH-FP-NEXT:    sw t1, 28(a5)
+; RV32I-WITH-FP-NEXT:    lw t1, 40(a2)
+; RV32I-WITH-FP-NEXT:    lw t2, 44(a2)
+; RV32I-WITH-FP-NEXT:    lw t3, 48(a2)
+; RV32I-WITH-FP-NEXT:    lw t4, 52(a2)
+; RV32I-WITH-FP-NEXT:    lw t5, 56(a2)
+; RV32I-WITH-FP-NEXT:    lw t6, 60(a2)
+; RV32I-WITH-FP-NEXT:    lw s1, 64(a2)
+; RV32I-WITH-FP-NEXT:    lw s2, 68(a2)
+; RV32I-WITH-FP-NEXT:    lw s3, 72(a2)
+; RV32I-WITH-FP-NEXT:    lw s4, 76(a2)
+; RV32I-WITH-FP-NEXT:    lw s5, 80(a2)
+; RV32I-WITH-FP-NEXT:    lw s6, 84(a2)
+; RV32I-WITH-FP-NEXT:    lw s7, 88(a2)
+; RV32I-WITH-FP-NEXT:    lw s8, 92(a2)
+; RV32I-WITH-FP-NEXT:    lw s9, 96(a2)
+; RV32I-WITH-FP-NEXT:    lw s10, 100(a2)
+; RV32I-WITH-FP-NEXT:    lw s11, 104(a2)
+; RV32I-WITH-FP-NEXT:    lw ra, 108(a2)
+; RV32I-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV32I-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV32I-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV32I-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV32I-WITH-FP-NEXT:    lw t0, %lo(var+4)(a5)
+; RV32I-WITH-FP-NEXT:    lw a7, %lo(var+8)(a5)
+; RV32I-WITH-FP-NEXT:    lw a6, %lo(var+12)(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV32I-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV32I-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV32I-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV32I-WITH-FP-NEXT:    sw ra, 108(a2)
+; RV32I-WITH-FP-NEXT:    sw s11, 104(a2)
+; RV32I-WITH-FP-NEXT:    sw s10, 100(a2)
+; RV32I-WITH-FP-NEXT:    sw s9, 96(a2)
+; RV32I-WITH-FP-NEXT:    sw s8, 92(a2)
+; RV32I-WITH-FP-NEXT:    sw s7, 88(a2)
+; RV32I-WITH-FP-NEXT:    sw s6, 84(a2)
+; RV32I-WITH-FP-NEXT:    sw s5, 80(a2)
+; RV32I-WITH-FP-NEXT:    sw s4, 76(a2)
+; RV32I-WITH-FP-NEXT:    sw s3, 72(a2)
+; RV32I-WITH-FP-NEXT:    sw s2, 68(a2)
+; RV32I-WITH-FP-NEXT:    sw s1, 64(a2)
+; RV32I-WITH-FP-NEXT:    sw t6, 60(a2)
+; RV32I-WITH-FP-NEXT:    sw t5, 56(a2)
+; RV32I-WITH-FP-NEXT:    sw t4, 52(a2)
+; RV32I-WITH-FP-NEXT:    sw t3, 48(a2)
+; RV32I-WITH-FP-NEXT:    sw t2, 44(a2)
+; RV32I-WITH-FP-NEXT:    sw t1, 40(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -80(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -76(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV32I-WITH-FP-NEXT:    sw a6, %lo(var+12)(a5)
+; RV32I-WITH-FP-NEXT:    sw a7, %lo(var+8)(a5)
+; RV32I-WITH-FP-NEXT:    sw t0, %lo(var+4)(a5)
 ; RV32I-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(a5)
 ; RV32I-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -260,84 +260,84 @@ define void @callee() nounwind {
 ; RV32IZCMP-LABEL: callee:
 ; RV32IZCMP:       # %bb.0:
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-NEXT:    lui a6, %hi(var)
-; RV32IZCMP-NEXT:    lw a0, %lo(var)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var)
+; RV32IZCMP-NEXT:    lw a0, %lo(var)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var)(a5)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV32IZCMP-WITH-FP-LABEL: callee:
@@ -360,81 +360,81 @@ define void @callee() nounwind {
 ; RV32IZCMP-WITH-FP-NEXT:    lui a6, %hi(var)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    addi a2, a6, %lo(var)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -76(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -80(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t1, 96(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t1, 96(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t5, 28(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t1, 92(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s1, 104(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a5, 108(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t4, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw t3, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw t2, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a5, 108(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s1, 104(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t1, 92(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t5, 40(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -80(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -76(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t2, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw t3, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw t4, %lo(var+4)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
@@ -469,84 +469,84 @@ define void @callee() nounwind {
 ; RV64I-NEXT:    sd s9, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var)
-; RV64I-NEXT:    lw a0, %lo(var)(a6)
+; RV64I-NEXT:    lui a4, %hi(var)
+; RV64I-NEXT:    lw a0, %lo(var)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var)(a6)
+; RV64I-NEXT:    sw a0, %lo(var)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
@@ -580,86 +580,86 @@ define void @callee() nounwind {
 ; RV64I-WITH-FP-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-WITH-FP-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
 ; RV64I-WITH-FP-NEXT:    addi s0, sp, 160
-; RV64I-WITH-FP-NEXT:    lui a6, %hi(var)
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
+; RV64I-WITH-FP-NEXT:    lui a5, %hi(var)
+; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(a5)
 ; RV64I-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64I-WITH-FP-NEXT:    addi a2, a5, %lo(var)
+; RV64I-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64I-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64I-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV64I-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -152(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -160(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw t1, 28(a5)
-; RV64I-WITH-FP-NEXT:    lw t2, 32(a5)
-; RV64I-WITH-FP-NEXT:    lw t3, 36(a5)
-; RV64I-WITH-FP-NEXT:    lw t4, 40(a5)
-; RV64I-WITH-FP-NEXT:    lw t5, 44(a5)
-; RV64I-WITH-FP-NEXT:    lw t6, 48(a5)
-; RV64I-WITH-FP-NEXT:    lw s1, 52(a5)
-; RV64I-WITH-FP-NEXT:    lw s2, 56(a5)
-; RV64I-WITH-FP-NEXT:    lw s3, 60(a5)
-; RV64I-WITH-FP-NEXT:    lw s4, 64(a5)
-; RV64I-WITH-FP-NEXT:    lw s5, 68(a5)
-; RV64I-WITH-FP-NEXT:    lw s6, 72(a5)
-; RV64I-WITH-FP-NEXT:    lw s7, 76(a5)
-; RV64I-WITH-FP-NEXT:    lw s8, 80(a5)
-; RV64I-WITH-FP-NEXT:    lw s9, 84(a5)
-; RV64I-WITH-FP-NEXT:    lw s10, 88(a5)
-; RV64I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV64I-WITH-FP-NEXT:    lw ra, 96(a5)
-; RV64I-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV64I-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV64I-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV64I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV64I-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV64I-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV64I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV64I-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV64I-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV64I-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV64I-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV64I-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV64I-WITH-FP-NEXT:    sw ra, 96(a5)
-; RV64I-WITH-FP-NEXT:    sw s11, 92(a5)
-; RV64I-WITH-FP-NEXT:    sw s10, 88(a5)
-; RV64I-WITH-FP-NEXT:    sw s9, 84(a5)
-; RV64I-WITH-FP-NEXT:    sw s8, 80(a5)
-; RV64I-WITH-FP-NEXT:    sw s7, 76(a5)
-; RV64I-WITH-FP-NEXT:    sw s6, 72(a5)
-; RV64I-WITH-FP-NEXT:    sw s5, 68(a5)
-; RV64I-WITH-FP-NEXT:    sw s4, 64(a5)
-; RV64I-WITH-FP-NEXT:    sw s3, 60(a5)
-; RV64I-WITH-FP-NEXT:    sw s2, 56(a5)
-; RV64I-WITH-FP-NEXT:    sw s1, 52(a5)
-; RV64I-WITH-FP-NEXT:    sw t6, 48(a5)
-; RV64I-WITH-FP-NEXT:    sw t5, 44(a5)
-; RV64I-WITH-FP-NEXT:    sw t4, 40(a5)
-; RV64I-WITH-FP-NEXT:    sw t3, 36(a5)
-; RV64I-WITH-FP-NEXT:    sw t2, 32(a5)
-; RV64I-WITH-FP-NEXT:    sw t1, 28(a5)
+; RV64I-WITH-FP-NEXT:    lw t1, 40(a2)
+; RV64I-WITH-FP-NEXT:    lw t2, 44(a2)
+; RV64I-WITH-FP-NEXT:    lw t3, 48(a2)
+; RV64I-WITH-FP-NEXT:    lw t4, 52(a2)
+; RV64I-WITH-FP-NEXT:    lw t5, 56(a2)
+; RV64I-WITH-FP-NEXT:    lw t6, 60(a2)
+; RV64I-WITH-FP-NEXT:    lw s1, 64(a2)
+; RV64I-WITH-FP-NEXT:    lw s2, 68(a2)
+; RV64I-WITH-FP-NEXT:    lw s3, 72(a2)
+; RV64I-WITH-FP-NEXT:    lw s4, 76(a2)
+; RV64I-WITH-FP-NEXT:    lw s5, 80(a2)
+; RV64I-WITH-FP-NEXT:    lw s6, 84(a2)
+; RV64I-WITH-FP-NEXT:    lw s7, 88(a2)
+; RV64I-WITH-FP-NEXT:    lw s8, 92(a2)
+; RV64I-WITH-FP-NEXT:    lw s9, 96(a2)
+; RV64I-WITH-FP-NEXT:    lw s10, 100(a2)
+; RV64I-WITH-FP-NEXT:    lw s11, 104(a2)
+; RV64I-WITH-FP-NEXT:    lw ra, 108(a2)
+; RV64I-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV64I-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV64I-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV64I-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV64I-WITH-FP-NEXT:    lw t0, %lo(var+4)(a5)
+; RV64I-WITH-FP-NEXT:    lw a7, %lo(var+8)(a5)
+; RV64I-WITH-FP-NEXT:    lw a6, %lo(var+12)(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV64I-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV64I-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV64I-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV64I-WITH-FP-NEXT:    sw ra, 108(a2)
+; RV64I-WITH-FP-NEXT:    sw s11, 104(a2)
+; RV64I-WITH-FP-NEXT:    sw s10, 100(a2)
+; RV64I-WITH-FP-NEXT:    sw s9, 96(a2)
+; RV64I-WITH-FP-NEXT:    sw s8, 92(a2)
+; RV64I-WITH-FP-NEXT:    sw s7, 88(a2)
+; RV64I-WITH-FP-NEXT:    sw s6, 84(a2)
+; RV64I-WITH-FP-NEXT:    sw s5, 80(a2)
+; RV64I-WITH-FP-NEXT:    sw s4, 76(a2)
+; RV64I-WITH-FP-NEXT:    sw s3, 72(a2)
+; RV64I-WITH-FP-NEXT:    sw s2, 68(a2)
+; RV64I-WITH-FP-NEXT:    sw s1, 64(a2)
+; RV64I-WITH-FP-NEXT:    sw t6, 60(a2)
+; RV64I-WITH-FP-NEXT:    sw t5, 56(a2)
+; RV64I-WITH-FP-NEXT:    sw t4, 52(a2)
+; RV64I-WITH-FP-NEXT:    sw t3, 48(a2)
+; RV64I-WITH-FP-NEXT:    sw t2, 44(a2)
+; RV64I-WITH-FP-NEXT:    sw t1, 40(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -160(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -152(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV64I-WITH-FP-NEXT:    sw a6, %lo(var+12)(a5)
+; RV64I-WITH-FP-NEXT:    sw a7, %lo(var+8)(a5)
+; RV64I-WITH-FP-NEXT:    sw t0, %lo(var+4)(a5)
 ; RV64I-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(a5)
 ; RV64I-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
@@ -679,84 +679,84 @@ define void @callee() nounwind {
 ; RV64IZCMP-LABEL: callee:
 ; RV64IZCMP:       # %bb.0:
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-NEXT:    lui a6, %hi(var)
-; RV64IZCMP-NEXT:    lw a0, %lo(var)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var)
+; RV64IZCMP-NEXT:    lw a0, %lo(var)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var)(a5)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV64IZCMP-WITH-FP-LABEL: callee:
@@ -779,81 +779,81 @@ define void @callee() nounwind {
 ; RV64IZCMP-WITH-FP-NEXT:    lui a6, %hi(var)
 ; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    addi a2, a6, %lo(var)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -152(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -160(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t1, 96(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t1, 96(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t5, 28(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t1, 92(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s1, 104(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a5, 108(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t4, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw t3, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw t2, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a5, 108(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s1, 104(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t1, 92(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t5, 40(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -160(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -152(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t2, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw t3, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw t4, %lo(var+4)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
 ; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 649234efaad903..15ce89b5d35646 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_scalars:
 ; RV32I-FPELIM:       # %bb.0:
-; RV32I-FPELIM-NEXT:    lw a2, 0(a1)
-; RV32I-FPELIM-NEXT:    lw a3, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a4, 4(a1)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a1)
-; RV32I-FPELIM-NEXT:    lw a6, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a7, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a2, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a4, 12(a1)
+; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a6, 0(a1)
+; RV32I-FPELIM-NEXT:    lw a7, 4(a1)
 ; RV32I-FPELIM-NEXT:    lw a1, 8(a1)
 ; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    xor a5, a6, a5
-; RV32I-FPELIM-NEXT:    xor a4, a7, a4
-; RV32I-FPELIM-NEXT:    or a4, a4, a5
+; RV32I-FPELIM-NEXT:    xor a4, a5, a4
+; RV32I-FPELIM-NEXT:    xor a3, a3, a7
+; RV32I-FPELIM-NEXT:    or a3, a3, a4
 ; RV32I-FPELIM-NEXT:    xor a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a2, a3, a2
-; RV32I-FPELIM-NEXT:    or a0, a2, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a4
+; RV32I-FPELIM-NEXT:    xor a1, a2, a6
+; RV32I-FPELIM-NEXT:    or a0, a1, a0
+; RV32I-FPELIM-NEXT:    or a0, a0, a3
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    lw a2, 0(a1)
-; RV32I-WITHFP-NEXT:    lw a3, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a4, 4(a1)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a1)
-; RV32I-WITHFP-NEXT:    lw a6, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a7, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a2, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a4, 12(a1)
+; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a6, 0(a1)
+; RV32I-WITHFP-NEXT:    lw a7, 4(a1)
 ; RV32I-WITHFP-NEXT:    lw a1, 8(a1)
 ; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    xor a5, a6, a5
-; RV32I-WITHFP-NEXT:    xor a4, a7, a4
-; RV32I-WITHFP-NEXT:    or a4, a4, a5
+; RV32I-WITHFP-NEXT:    xor a4, a5, a4
+; RV32I-WITHFP-NEXT:    xor a3, a3, a7
+; RV32I-WITHFP-NEXT:    or a3, a3, a4
 ; RV32I-WITHFP-NEXT:    xor a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a2, a3, a2
-; RV32I-WITHFP-NEXT:    or a0, a2, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a4
+; RV32I-WITHFP-NEXT:    xor a1, a2, a6
+; RV32I-WITHFP-NEXT:    or a0, a1, a0
+; RV32I-WITHFP-NEXT:    or a0, a0, a3
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -297,21 +297,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-LABEL: callee_large_scalars_exhausted_regs:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
-; RV32I-FPELIM-NEXT:    lw a1, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a2, 0(a7)
-; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a4, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a7)
-; RV32I-FPELIM-NEXT:    lw a6, 4(a7)
+; RV32I-FPELIM-NEXT:    lw a1, 0(a7)
+; RV32I-FPELIM-NEXT:    lw a2, 4(a7)
+; RV32I-FPELIM-NEXT:    lw a3, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
+; RV32I-FPELIM-NEXT:    lw a5, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
 ; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
 ; RV32I-FPELIM-NEXT:    lw a7, 8(a7)
-; RV32I-FPELIM-NEXT:    xor a4, a5, a4
-; RV32I-FPELIM-NEXT:    xor a3, a6, a3
-; RV32I-FPELIM-NEXT:    or a3, a3, a4
+; RV32I-FPELIM-NEXT:    xor a3, a4, a3
+; RV32I-FPELIM-NEXT:    xor a2, a2, a6
+; RV32I-FPELIM-NEXT:    or a2, a2, a3
 ; RV32I-FPELIM-NEXT:    xor a0, a7, a0
-; RV32I-FPELIM-NEXT:    xor a1, a2, a1
+; RV32I-FPELIM-NEXT:    xor a1, a1, a5
 ; RV32I-FPELIM-NEXT:    or a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -322,21 +322,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
-; RV32I-WITHFP-NEXT:    lw a1, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a2, 0(a7)
-; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a4, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a7)
-; RV32I-WITHFP-NEXT:    lw a6, 4(a7)
+; RV32I-WITHFP-NEXT:    lw a1, 0(a7)
+; RV32I-WITHFP-NEXT:    lw a2, 4(a7)
+; RV32I-WITHFP-NEXT:    lw a3, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
+; RV32I-WITHFP-NEXT:    lw a5, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
 ; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
 ; RV32I-WITHFP-NEXT:    lw a7, 8(a7)
-; RV32I-WITHFP-NEXT:    xor a4, a5, a4
-; RV32I-WITHFP-NEXT:    xor a3, a6, a3
-; RV32I-WITHFP-NEXT:    or a3, a3, a4
+; RV32I-WITHFP-NEXT:    xor a3, a4, a3
+; RV32I-WITHFP-NEXT:    xor a2, a2, a6
+; RV32I-WITHFP-NEXT:    or a2, a2, a3
 ; RV32I-WITHFP-NEXT:    xor a0, a7, a0
-; RV32I-WITHFP-NEXT:    xor a1, a2, a1
+; RV32I-WITHFP-NEXT:    xor a1, a1, a5
 ; RV32I-WITHFP-NEXT:    or a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index c2690d15665e21..a174c3422dbb27 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind {
 define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-LABEL: callee_large_scalars:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a2, 0(a1)
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    ld a4, 8(a1)
-; RV64I-NEXT:    ld a5, 24(a1)
-; RV64I-NEXT:    ld a6, 24(a0)
-; RV64I-NEXT:    ld a7, 8(a0)
+; RV64I-NEXT:    ld a2, 0(a0)
+; RV64I-NEXT:    ld a3, 8(a0)
+; RV64I-NEXT:    ld a4, 24(a1)
+; RV64I-NEXT:    ld a5, 24(a0)
+; RV64I-NEXT:    ld a6, 0(a1)
+; RV64I-NEXT:    ld a7, 8(a1)
 ; RV64I-NEXT:    ld a1, 16(a1)
 ; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    xor a5, a6, a5
-; RV64I-NEXT:    xor a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    xor a4, a5, a4
+; RV64I-NEXT:    xor a3, a3, a7
+; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    xor a0, a0, a1
-; RV64I-NEXT:    xor a2, a3, a2
-; RV64I-NEXT:    or a0, a2, a0
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    xor a1, a2, a6
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -161,21 +161,21 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-LABEL: callee_large_scalars_exhausted_regs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    ld a0, 8(sp)
-; RV64I-NEXT:    ld a1, 0(a0)
-; RV64I-NEXT:    ld a2, 0(a7)
-; RV64I-NEXT:    ld a3, 8(a0)
-; RV64I-NEXT:    ld a4, 24(a0)
-; RV64I-NEXT:    ld a5, 24(a7)
-; RV64I-NEXT:    ld a6, 8(a7)
+; RV64I-NEXT:    ld a1, 0(a7)
+; RV64I-NEXT:    ld a2, 8(a7)
+; RV64I-NEXT:    ld a3, 24(a0)
+; RV64I-NEXT:    ld a4, 24(a7)
+; RV64I-NEXT:    ld a5, 0(a0)
+; RV64I-NEXT:    ld a6, 8(a0)
 ; RV64I-NEXT:    ld a0, 16(a0)
 ; RV64I-NEXT:    ld a7, 16(a7)
-; RV64I-NEXT:    xor a4, a5, a4
-; RV64I-NEXT:    xor a3, a6, a3
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    xor a3, a4, a3
+; RV64I-NEXT:    xor a2, a2, a6
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    xor a0, a7, a0
-; RV64I-NEXT:    xor a1, a2, a1
+; RV64I-NEXT:    xor a1, a1, a5
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %h, %j
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index f2079e314d51c1..bfa14b52f18e93 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3348,8 +3348,8 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB49_2
 ; RV32-NEXT:  .LBB49_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB49_2 Depth=1
@@ -3362,8 +3362,8 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    bnez a0, .LBB49_6
 ; RV32-NEXT:  .LBB49_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -3453,8 +3453,8 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB50_2
 ; RV32-NEXT:  .LBB50_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB50_2 Depth=1
@@ -3467,8 +3467,8 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    bnez a0, .LBB50_6
 ; RV32-NEXT:  .LBB50_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -3560,8 +3560,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB51_2
 ; RV32-NEXT:  .LBB51_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB51_2 Depth=1
@@ -3574,8 +3574,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    bnez a0, .LBB51_4
 ; RV32-NEXT:  .LBB51_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -3652,8 +3652,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB52_2
 ; RV32-NEXT:  .LBB52_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB52_2 Depth=1
@@ -3666,8 +3666,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    bnez a0, .LBB52_4
 ; RV32-NEXT:  .LBB52_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -3802,30 +3802,30 @@ define double @rmw64_fadd_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
-; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 0(a0)
+; RV32-NEXT:    lw s2, 4(a0)
 ; RV32-NEXT:  .LBB54_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 261888
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    li a2, 0
 ; RV32-NEXT:    call __adddf3@plt
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    mv a3, a1
-; RV32-NEXT:    sw s2, 8(sp)
-; RV32-NEXT:    sw s1, 12(sp)
+; RV32-NEXT:    sw s1, 8(sp)
+; RV32-NEXT:    sw s2, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw s1, 12(sp)
-; RV32-NEXT:    lw s2, 8(sp)
+; RV32-NEXT:    lw s1, 8(sp)
+; RV32-NEXT:    lw s2, 12(sp)
 ; RV32-NEXT:    beqz a0, .LBB54_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3937,30 +3937,30 @@ define double @rmw64_fsub_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
-; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 0(a0)
+; RV32-NEXT:    lw s2, 4(a0)
 ; RV32-NEXT:  .LBB55_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 786176
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    li a2, 0
 ; RV32-NEXT:    call __adddf3@plt
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    mv a3, a1
-; RV32-NEXT:    sw s2, 8(sp)
-; RV32-NEXT:    sw s1, 12(sp)
+; RV32-NEXT:    sw s1, 8(sp)
+; RV32-NEXT:    sw s2, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw s1, 12(sp)
-; RV32-NEXT:    lw s2, 8(sp)
+; RV32-NEXT:    lw s1, 8(sp)
+; RV32-NEXT:    lw s2, 12(sp)
 ; RV32-NEXT:    beqz a0, .LBB55_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -4072,30 +4072,30 @@ define double @rmw64_fmin_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
-; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 0(a0)
+; RV32-NEXT:    lw s2, 4(a0)
 ; RV32-NEXT:  .LBB56_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 261888
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    li a2, 0
 ; RV32-NEXT:    call fmin@plt
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    mv a3, a1
-; RV32-NEXT:    sw s2, 8(sp)
-; RV32-NEXT:    sw s1, 12(sp)
+; RV32-NEXT:    sw s1, 8(sp)
+; RV32-NEXT:    sw s2, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw s1, 12(sp)
-; RV32-NEXT:    lw s2, 8(sp)
+; RV32-NEXT:    lw s1, 8(sp)
+; RV32-NEXT:    lw s2, 12(sp)
 ; RV32-NEXT:    beqz a0, .LBB56_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -4207,30 +4207,30 @@ define double @rmw64_fmax_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
-; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 0(a0)
+; RV32-NEXT:    lw s2, 4(a0)
 ; RV32-NEXT:  .LBB57_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 261888
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    li a2, 0
 ; RV32-NEXT:    call fmax@plt
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    mv a3, a1
-; RV32-NEXT:    sw s2, 8(sp)
-; RV32-NEXT:    sw s1, 12(sp)
+; RV32-NEXT:    sw s1, 8(sp)
+; RV32-NEXT:    sw s2, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw s1, 12(sp)
-; RV32-NEXT:    lw s2, 8(sp)
+; RV32-NEXT:    lw s1, 8(sp)
+; RV32-NEXT:    lw s2, 12(sp)
 ; RV32-NEXT:    beqz a0, .LBB57_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -4346,8 +4346,8 @@ define i64 @cmpxchg64_monotonic(ptr %p) nounwind {
 ; RV32-NEXT:    li a4, 0
 ; RV32-NEXT:    li a5, 0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw a0, 0(sp)
+; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -4406,8 +4406,8 @@ define i64 @cmpxchg64_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    li a3, 0
 ; RV32-NEXT:    call __atomic_compare_exchange_8@plt
-; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw a0, 0(sp)
+; RV32-NEXT:    lw a1, 4(sp)
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -4531,25 +4531,25 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a1
-; RV32-NEXT:    lw a1, 12(a1)
-; RV32-NEXT:    lw a2, 8(s0)
-; RV32-NEXT:    lw a3, 4(s0)
-; RV32-NEXT:    lw a4, 0(s0)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a2, 4(s0)
+; RV32-NEXT:    lw a3, 8(s0)
+; RV32-NEXT:    lw a4, 12(s0)
 ; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:  .LBB62_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    addi a0, a4, 1
+; RV32-NEXT:    addi a0, a1, 1
 ; RV32-NEXT:    seqz a5, a0
-; RV32-NEXT:    add a5, a3, a5
+; RV32-NEXT:    add a5, a2, a5
 ; RV32-NEXT:    or a6, a0, a5
 ; RV32-NEXT:    seqz a6, a6
-; RV32-NEXT:    add a6, a2, a6
-; RV32-NEXT:    sltu a7, a6, a2
-; RV32-NEXT:    add a7, a1, a7
-; RV32-NEXT:    sw a4, 16(sp)
-; RV32-NEXT:    sw a3, 20(sp)
-; RV32-NEXT:    sw a2, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    add a6, a3, a6
+; RV32-NEXT:    sltu a7, a6, a3
+; RV32-NEXT:    add a7, a4, a7
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    sw a3, 24(sp)
+; RV32-NEXT:    sw a4, 28(sp)
 ; RV32-NEXT:    sw a5, 4(sp)
 ; RV32-NEXT:    sw a0, 0(sp)
 ; RV32-NEXT:    sw a6, 8(sp)
@@ -4561,16 +4561,16 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a1, s0
 ; RV32-NEXT:    call __atomic_compare_exchange@plt
-; RV32-NEXT:    lw a1, 28(sp)
-; RV32-NEXT:    lw a2, 24(sp)
-; RV32-NEXT:    lw a3, 20(sp)
-; RV32-NEXT:    lw a4, 16(sp)
+; RV32-NEXT:    lw a1, 16(sp)
+; RV32-NEXT:    lw a2, 20(sp)
+; RV32-NEXT:    lw a3, 24(sp)
+; RV32-NEXT:    lw a4, 28(sp)
 ; RV32-NEXT:    beqz a0, .LBB62_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    sw a4, 0(s1)
-; RV32-NEXT:    sw a3, 4(s1)
-; RV32-NEXT:    sw a2, 8(s1)
-; RV32-NEXT:    sw a1, 12(s1)
+; RV32-NEXT:    sw a1, 0(s1)
+; RV32-NEXT:    sw a2, 4(s1)
+; RV32-NEXT:    sw a3, 8(s1)
+; RV32-NEXT:    sw a4, 12(s1)
 ; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
@@ -4639,8 +4639,8 @@ define i128 @cmpxchg128(ptr %p) nounwind {
 ; RV64-NEXT:    li a5, 5
 ; RV64-NEXT:    li a3, 0
 ; RV64-NEXT:    call __atomic_compare_exchange_16@plt
-; RV64-NEXT:    ld a1, 8(sp)
 ; RV64-NEXT:    ld a0, 0(sp)
+; RV64-NEXT:    ld a1, 8(sp)
 ; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 32
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index b091b0613c0f30..7695815cf50686 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -1043,24 +1043,24 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti@plt
-; RV32IF-NEXT:    lw a0, 20(sp)
-; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a3, 8(sp)
 ; RV32IF-NEXT:    lw a1, 12(sp)
-; RV32IF-NEXT:    lw a4, 8(sp)
-; RV32IF-NEXT:    lui a3, 524288
-; RV32IF-NEXT:    addi a5, a3, -1
+; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a4, 20(sp)
+; RV32IF-NEXT:    lui a0, 524288
+; RV32IF-NEXT:    addi a5, a0, -1
 ; RV32IF-NEXT:    beq a1, a5, .LBB18_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
 ; RV32IF-NEXT:    sltu a6, a1, a5
-; RV32IF-NEXT:    or a7, a2, a0
+; RV32IF-NEXT:    or a7, a2, a4
 ; RV32IF-NEXT:    bnez a7, .LBB18_3
 ; RV32IF-NEXT:    j .LBB18_4
 ; RV32IF-NEXT:  .LBB18_2:
-; RV32IF-NEXT:    sltiu a6, a4, -1
-; RV32IF-NEXT:    or a7, a2, a0
+; RV32IF-NEXT:    sltiu a6, a3, -1
+; RV32IF-NEXT:    or a7, a2, a4
 ; RV32IF-NEXT:    beqz a7, .LBB18_4
 ; RV32IF-NEXT:  .LBB18_3: # %entry
-; RV32IF-NEXT:    slti a6, a0, 0
+; RV32IF-NEXT:    slti a6, a4, 0
 ; RV32IF-NEXT:  .LBB18_4: # %entry
 ; RV32IF-NEXT:    neg a7, a6
 ; RV32IF-NEXT:    addi t0, a6, -1
@@ -1068,21 +1068,21 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:  # %bb.5: # %entry
 ; RV32IF-NEXT:    mv a1, a5
 ; RV32IF-NEXT:  .LBB18_6: # %entry
-; RV32IF-NEXT:    or a4, t0, a4
-; RV32IF-NEXT:    and a5, a7, a0
+; RV32IF-NEXT:    or a3, t0, a3
+; RV32IF-NEXT:    and a4, a7, a4
 ; RV32IF-NEXT:    and a2, a7, a2
-; RV32IF-NEXT:    beq a1, a3, .LBB18_8
+; RV32IF-NEXT:    beq a1, a0, .LBB18_8
 ; RV32IF-NEXT:  # %bb.7: # %entry
-; RV32IF-NEXT:    sltu a0, a3, a1
+; RV32IF-NEXT:    sltu a0, a0, a1
 ; RV32IF-NEXT:    j .LBB18_9
 ; RV32IF-NEXT:  .LBB18_8:
-; RV32IF-NEXT:    snez a0, a4
+; RV32IF-NEXT:    snez a0, a3
 ; RV32IF-NEXT:  .LBB18_9: # %entry
-; RV32IF-NEXT:    and a2, a2, a5
-; RV32IF-NEXT:    li a3, -1
-; RV32IF-NEXT:    beq a2, a3, .LBB18_11
+; RV32IF-NEXT:    and a2, a2, a4
+; RV32IF-NEXT:    li a5, -1
+; RV32IF-NEXT:    beq a2, a5, .LBB18_11
 ; RV32IF-NEXT:  # %bb.10: # %entry
-; RV32IF-NEXT:    slti a0, a5, 0
+; RV32IF-NEXT:    slti a0, a4, 0
 ; RV32IF-NEXT:    xori a0, a0, 1
 ; RV32IF-NEXT:  .LBB18_11: # %entry
 ; RV32IF-NEXT:    bnez a0, .LBB18_13
@@ -1090,7 +1090,7 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:    lui a1, 524288
 ; RV32IF-NEXT:  .LBB18_13: # %entry
 ; RV32IF-NEXT:    neg a0, a0
-; RV32IF-NEXT:    and a0, a0, a4
+; RV32IF-NEXT:    and a0, a0, a3
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -1142,24 +1142,24 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti@plt
-; RV32IFD-NEXT:    lw a0, 20(sp)
-; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a3, 8(sp)
 ; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a4, 8(sp)
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    addi a5, a3, -1
+; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a4, 20(sp)
+; RV32IFD-NEXT:    lui a0, 524288
+; RV32IFD-NEXT:    addi a5, a0, -1
 ; RV32IFD-NEXT:    beq a1, a5, .LBB18_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
 ; RV32IFD-NEXT:    sltu a6, a1, a5
-; RV32IFD-NEXT:    or a7, a2, a0
+; RV32IFD-NEXT:    or a7, a2, a4
 ; RV32IFD-NEXT:    bnez a7, .LBB18_3
 ; RV32IFD-NEXT:    j .LBB18_4
 ; RV32IFD-NEXT:  .LBB18_2:
-; RV32IFD-NEXT:    sltiu a6, a4, -1
-; RV32IFD-NEXT:    or a7, a2, a0
+; RV32IFD-NEXT:    sltiu a6, a3, -1
+; RV32IFD-NEXT:    or a7, a2, a4
 ; RV32IFD-NEXT:    beqz a7, .LBB18_4
 ; RV32IFD-NEXT:  .LBB18_3: # %entry
-; RV32IFD-NEXT:    slti a6, a0, 0
+; RV32IFD-NEXT:    slti a6, a4, 0
 ; RV32IFD-NEXT:  .LBB18_4: # %entry
 ; RV32IFD-NEXT:    neg a7, a6
 ; RV32IFD-NEXT:    addi t0, a6, -1
@@ -1167,21 +1167,21 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:  # %bb.5: # %entry
 ; RV32IFD-NEXT:    mv a1, a5
 ; RV32IFD-NEXT:  .LBB18_6: # %entry
-; RV32IFD-NEXT:    or a4, t0, a4
-; RV32IFD-NEXT:    and a5, a7, a0
+; RV32IFD-NEXT:    or a3, t0, a3
+; RV32IFD-NEXT:    and a4, a7, a4
 ; RV32IFD-NEXT:    and a2, a7, a2
-; RV32IFD-NEXT:    beq a1, a3, .LBB18_8
+; RV32IFD-NEXT:    beq a1, a0, .LBB18_8
 ; RV32IFD-NEXT:  # %bb.7: # %entry
-; RV32IFD-NEXT:    sltu a0, a3, a1
+; RV32IFD-NEXT:    sltu a0, a0, a1
 ; RV32IFD-NEXT:    j .LBB18_9
 ; RV32IFD-NEXT:  .LBB18_8:
-; RV32IFD-NEXT:    snez a0, a4
+; RV32IFD-NEXT:    snez a0, a3
 ; RV32IFD-NEXT:  .LBB18_9: # %entry
-; RV32IFD-NEXT:    and a2, a2, a5
-; RV32IFD-NEXT:    li a3, -1
-; RV32IFD-NEXT:    beq a2, a3, .LBB18_11
+; RV32IFD-NEXT:    and a2, a2, a4
+; RV32IFD-NEXT:    li a5, -1
+; RV32IFD-NEXT:    beq a2, a5, .LBB18_11
 ; RV32IFD-NEXT:  # %bb.10: # %entry
-; RV32IFD-NEXT:    slti a0, a5, 0
+; RV32IFD-NEXT:    slti a0, a4, 0
 ; RV32IFD-NEXT:    xori a0, a0, 1
 ; RV32IFD-NEXT:  .LBB18_11: # %entry
 ; RV32IFD-NEXT:    bnez a0, .LBB18_13
@@ -1189,7 +1189,7 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:    lui a1, 524288
 ; RV32IFD-NEXT:  .LBB18_13: # %entry
 ; RV32IFD-NEXT:    neg a0, a0
-; RV32IFD-NEXT:    and a0, a0, a4
+; RV32IFD-NEXT:    and a0, a0, a3
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -1440,24 +1440,24 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti@plt
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lui a3, 524288
-; RV32-NEXT:    addi a5, a3, -1
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a4, 20(sp)
+; RV32-NEXT:    lui a0, 524288
+; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB21_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    bnez a7, .LBB21_3
 ; RV32-NEXT:    j .LBB21_4
 ; RV32-NEXT:  .LBB21_2:
-; RV32-NEXT:    sltiu a6, a4, -1
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    sltiu a6, a3, -1
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB21_4
 ; RV32-NEXT:  .LBB21_3: # %entry
-; RV32-NEXT:    slti a6, a0, 0
+; RV32-NEXT:    slti a6, a4, 0
 ; RV32-NEXT:  .LBB21_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -1465,21 +1465,21 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB21_6: # %entry
-; RV32-NEXT:    or a4, t0, a4
-; RV32-NEXT:    and a5, a7, a0
+; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    and a4, a7, a4
 ; RV32-NEXT:    and a2, a7, a2
-; RV32-NEXT:    beq a1, a3, .LBB21_8
+; RV32-NEXT:    beq a1, a0, .LBB21_8
 ; RV32-NEXT:  # %bb.7: # %entry
-; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB21_9
 ; RV32-NEXT:  .LBB21_8:
-; RV32-NEXT:    snez a0, a4
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:  .LBB21_9: # %entry
-; RV32-NEXT:    and a2, a2, a5
-; RV32-NEXT:    li a3, -1
-; RV32-NEXT:    beq a2, a3, .LBB21_11
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    li a5, -1
+; RV32-NEXT:    beq a2, a5, .LBB21_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a5, 0
+; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:  .LBB21_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB21_13
@@ -1487,7 +1487,7 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB21_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -1657,24 +1657,24 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:    call __extendhfsf2@plt
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti@plt
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lui a3, 524288
-; RV32-NEXT:    addi a5, a3, -1
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a4, 20(sp)
+; RV32-NEXT:    lui a0, 524288
+; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB24_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    bnez a7, .LBB24_3
 ; RV32-NEXT:    j .LBB24_4
 ; RV32-NEXT:  .LBB24_2:
-; RV32-NEXT:    sltiu a6, a4, -1
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    sltiu a6, a3, -1
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB24_4
 ; RV32-NEXT:  .LBB24_3: # %entry
-; RV32-NEXT:    slti a6, a0, 0
+; RV32-NEXT:    slti a6, a4, 0
 ; RV32-NEXT:  .LBB24_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -1682,21 +1682,21 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB24_6: # %entry
-; RV32-NEXT:    or a4, t0, a4
-; RV32-NEXT:    and a5, a7, a0
+; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    and a4, a7, a4
 ; RV32-NEXT:    and a2, a7, a2
-; RV32-NEXT:    beq a1, a3, .LBB24_8
+; RV32-NEXT:    beq a1, a0, .LBB24_8
 ; RV32-NEXT:  # %bb.7: # %entry
-; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB24_9
 ; RV32-NEXT:  .LBB24_8:
-; RV32-NEXT:    snez a0, a4
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:  .LBB24_9: # %entry
-; RV32-NEXT:    and a2, a2, a5
-; RV32-NEXT:    li a3, -1
-; RV32-NEXT:    beq a2, a3, .LBB24_11
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    li a5, -1
+; RV32-NEXT:    beq a2, a5, .LBB24_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a5, 0
+; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:  .LBB24_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB24_13
@@ -1704,7 +1704,7 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB24_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -2892,24 +2892,24 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti@plt
-; RV32IF-NEXT:    lw a0, 20(sp)
-; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a3, 8(sp)
 ; RV32IF-NEXT:    lw a1, 12(sp)
-; RV32IF-NEXT:    lw a4, 8(sp)
-; RV32IF-NEXT:    lui a3, 524288
-; RV32IF-NEXT:    addi a5, a3, -1
+; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a4, 20(sp)
+; RV32IF-NEXT:    lui a0, 524288
+; RV32IF-NEXT:    addi a5, a0, -1
 ; RV32IF-NEXT:    beq a1, a5, .LBB45_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
 ; RV32IF-NEXT:    sltu a6, a1, a5
-; RV32IF-NEXT:    or a7, a2, a0
+; RV32IF-NEXT:    or a7, a2, a4
 ; RV32IF-NEXT:    bnez a7, .LBB45_3
 ; RV32IF-NEXT:    j .LBB45_4
 ; RV32IF-NEXT:  .LBB45_2:
-; RV32IF-NEXT:    sltiu a6, a4, -1
-; RV32IF-NEXT:    or a7, a2, a0
+; RV32IF-NEXT:    sltiu a6, a3, -1
+; RV32IF-NEXT:    or a7, a2, a4
 ; RV32IF-NEXT:    beqz a7, .LBB45_4
 ; RV32IF-NEXT:  .LBB45_3: # %entry
-; RV32IF-NEXT:    slti a6, a0, 0
+; RV32IF-NEXT:    slti a6, a4, 0
 ; RV32IF-NEXT:  .LBB45_4: # %entry
 ; RV32IF-NEXT:    neg a7, a6
 ; RV32IF-NEXT:    addi t0, a6, -1
@@ -2917,21 +2917,21 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:  # %bb.5: # %entry
 ; RV32IF-NEXT:    mv a1, a5
 ; RV32IF-NEXT:  .LBB45_6: # %entry
-; RV32IF-NEXT:    or a4, t0, a4
-; RV32IF-NEXT:    and a5, a7, a0
+; RV32IF-NEXT:    or a3, t0, a3
+; RV32IF-NEXT:    and a4, a7, a4
 ; RV32IF-NEXT:    and a2, a7, a2
-; RV32IF-NEXT:    beq a1, a3, .LBB45_8
+; RV32IF-NEXT:    beq a1, a0, .LBB45_8
 ; RV32IF-NEXT:  # %bb.7: # %entry
-; RV32IF-NEXT:    sltu a0, a3, a1
+; RV32IF-NEXT:    sltu a0, a0, a1
 ; RV32IF-NEXT:    j .LBB45_9
 ; RV32IF-NEXT:  .LBB45_8:
-; RV32IF-NEXT:    snez a0, a4
+; RV32IF-NEXT:    snez a0, a3
 ; RV32IF-NEXT:  .LBB45_9: # %entry
-; RV32IF-NEXT:    and a2, a2, a5
-; RV32IF-NEXT:    li a3, -1
-; RV32IF-NEXT:    beq a2, a3, .LBB45_11
+; RV32IF-NEXT:    and a2, a2, a4
+; RV32IF-NEXT:    li a5, -1
+; RV32IF-NEXT:    beq a2, a5, .LBB45_11
 ; RV32IF-NEXT:  # %bb.10: # %entry
-; RV32IF-NEXT:    slti a0, a5, 0
+; RV32IF-NEXT:    slti a0, a4, 0
 ; RV32IF-NEXT:    xori a0, a0, 1
 ; RV32IF-NEXT:  .LBB45_11: # %entry
 ; RV32IF-NEXT:    bnez a0, .LBB45_13
@@ -2939,7 +2939,7 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    lui a1, 524288
 ; RV32IF-NEXT:  .LBB45_13: # %entry
 ; RV32IF-NEXT:    neg a0, a0
-; RV32IF-NEXT:    and a0, a0, a4
+; RV32IF-NEXT:    and a0, a0, a3
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -2991,24 +2991,24 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti@plt
-; RV32IFD-NEXT:    lw a0, 20(sp)
-; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a3, 8(sp)
 ; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a4, 8(sp)
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    addi a5, a3, -1
+; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a4, 20(sp)
+; RV32IFD-NEXT:    lui a0, 524288
+; RV32IFD-NEXT:    addi a5, a0, -1
 ; RV32IFD-NEXT:    beq a1, a5, .LBB45_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
 ; RV32IFD-NEXT:    sltu a6, a1, a5
-; RV32IFD-NEXT:    or a7, a2, a0
+; RV32IFD-NEXT:    or a7, a2, a4
 ; RV32IFD-NEXT:    bnez a7, .LBB45_3
 ; RV32IFD-NEXT:    j .LBB45_4
 ; RV32IFD-NEXT:  .LBB45_2:
-; RV32IFD-NEXT:    sltiu a6, a4, -1
-; RV32IFD-NEXT:    or a7, a2, a0
+; RV32IFD-NEXT:    sltiu a6, a3, -1
+; RV32IFD-NEXT:    or a7, a2, a4
 ; RV32IFD-NEXT:    beqz a7, .LBB45_4
 ; RV32IFD-NEXT:  .LBB45_3: # %entry
-; RV32IFD-NEXT:    slti a6, a0, 0
+; RV32IFD-NEXT:    slti a6, a4, 0
 ; RV32IFD-NEXT:  .LBB45_4: # %entry
 ; RV32IFD-NEXT:    neg a7, a6
 ; RV32IFD-NEXT:    addi t0, a6, -1
@@ -3016,21 +3016,21 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:  # %bb.5: # %entry
 ; RV32IFD-NEXT:    mv a1, a5
 ; RV32IFD-NEXT:  .LBB45_6: # %entry
-; RV32IFD-NEXT:    or a4, t0, a4
-; RV32IFD-NEXT:    and a5, a7, a0
+; RV32IFD-NEXT:    or a3, t0, a3
+; RV32IFD-NEXT:    and a4, a7, a4
 ; RV32IFD-NEXT:    and a2, a7, a2
-; RV32IFD-NEXT:    beq a1, a3, .LBB45_8
+; RV32IFD-NEXT:    beq a1, a0, .LBB45_8
 ; RV32IFD-NEXT:  # %bb.7: # %entry
-; RV32IFD-NEXT:    sltu a0, a3, a1
+; RV32IFD-NEXT:    sltu a0, a0, a1
 ; RV32IFD-NEXT:    j .LBB45_9
 ; RV32IFD-NEXT:  .LBB45_8:
-; RV32IFD-NEXT:    snez a0, a4
+; RV32IFD-NEXT:    snez a0, a3
 ; RV32IFD-NEXT:  .LBB45_9: # %entry
-; RV32IFD-NEXT:    and a2, a2, a5
-; RV32IFD-NEXT:    li a3, -1
-; RV32IFD-NEXT:    beq a2, a3, .LBB45_11
+; RV32IFD-NEXT:    and a2, a2, a4
+; RV32IFD-NEXT:    li a5, -1
+; RV32IFD-NEXT:    beq a2, a5, .LBB45_11
 ; RV32IFD-NEXT:  # %bb.10: # %entry
-; RV32IFD-NEXT:    slti a0, a5, 0
+; RV32IFD-NEXT:    slti a0, a4, 0
 ; RV32IFD-NEXT:    xori a0, a0, 1
 ; RV32IFD-NEXT:  .LBB45_11: # %entry
 ; RV32IFD-NEXT:    bnez a0, .LBB45_13
@@ -3038,7 +3038,7 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    lui a1, 524288
 ; RV32IFD-NEXT:  .LBB45_13: # %entry
 ; RV32IFD-NEXT:    neg a0, a0
-; RV32IFD-NEXT:    and a0, a0, a4
+; RV32IFD-NEXT:    and a0, a0, a3
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -3145,30 +3145,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti@plt
-; RV32IF-NEXT:    lw a0, 8(sp)
-; RV32IF-NEXT:    lw a1, 12(sp)
-; RV32IF-NEXT:    lw a2, 20(sp)
+; RV32IF-NEXT:    lw a0, 20(sp)
+; RV32IF-NEXT:    lw a1, 8(sp)
+; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 16(sp)
-; RV32IF-NEXT:    beqz a2, .LBB47_2
+; RV32IF-NEXT:    beqz a0, .LBB47_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a2, 0
+; RV32IF-NEXT:    slti a4, a0, 0
 ; RV32IF-NEXT:    j .LBB47_3
 ; RV32IF-NEXT:  .LBB47_2:
 ; RV32IF-NEXT:    seqz a4, a3
 ; RV32IF-NEXT:  .LBB47_3: # %entry
 ; RV32IF-NEXT:    xori a3, a3, 1
-; RV32IF-NEXT:    or a3, a3, a2
+; RV32IF-NEXT:    or a3, a3, a0
 ; RV32IF-NEXT:    seqz a3, a3
 ; RV32IF-NEXT:    addi a3, a3, -1
 ; RV32IF-NEXT:    and a3, a3, a4
 ; RV32IF-NEXT:    neg a3, a3
+; RV32IF-NEXT:    and a2, a3, a2
 ; RV32IF-NEXT:    and a1, a3, a1
 ; RV32IF-NEXT:    and a0, a3, a0
-; RV32IF-NEXT:    and a2, a3, a2
-; RV32IF-NEXT:    slti a2, a2, 0
-; RV32IF-NEXT:    addi a2, a2, -1
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    and a1, a2, a1
+; RV32IF-NEXT:    slti a0, a0, 0
+; RV32IF-NEXT:    addi a3, a0, -1
+; RV32IF-NEXT:    and a0, a3, a1
+; RV32IF-NEXT:    and a1, a3, a2
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -3203,30 +3203,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti@plt
-; RV32IFD-NEXT:    lw a0, 8(sp)
-; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a2, 20(sp)
+; RV32IFD-NEXT:    lw a0, 20(sp)
+; RV32IFD-NEXT:    lw a1, 8(sp)
+; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 16(sp)
-; RV32IFD-NEXT:    beqz a2, .LBB47_2
+; RV32IFD-NEXT:    beqz a0, .LBB47_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
-; RV32IFD-NEXT:    slti a4, a2, 0
+; RV32IFD-NEXT:    slti a4, a0, 0
 ; RV32IFD-NEXT:    j .LBB47_3
 ; RV32IFD-NEXT:  .LBB47_2:
 ; RV32IFD-NEXT:    seqz a4, a3
 ; RV32IFD-NEXT:  .LBB47_3: # %entry
 ; RV32IFD-NEXT:    xori a3, a3, 1
-; RV32IFD-NEXT:    or a3, a3, a2
+; RV32IFD-NEXT:    or a3, a3, a0
 ; RV32IFD-NEXT:    seqz a3, a3
 ; RV32IFD-NEXT:    addi a3, a3, -1
 ; RV32IFD-NEXT:    and a3, a3, a4
 ; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a2, a3, a2
 ; RV32IFD-NEXT:    and a1, a3, a1
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    and a2, a3, a2
-; RV32IFD-NEXT:    slti a2, a2, 0
-; RV32IFD-NEXT:    addi a2, a2, -1
-; RV32IFD-NEXT:    and a0, a2, a0
-; RV32IFD-NEXT:    and a1, a2, a1
+; RV32IFD-NEXT:    slti a0, a0, 0
+; RV32IFD-NEXT:    addi a3, a0, -1
+; RV32IFD-NEXT:    and a0, a3, a1
+; RV32IFD-NEXT:    and a1, a3, a2
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -3247,24 +3247,24 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti@plt
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lui a3, 524288
-; RV32-NEXT:    addi a5, a3, -1
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a4, 20(sp)
+; RV32-NEXT:    lui a0, 524288
+; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB48_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    bnez a7, .LBB48_3
 ; RV32-NEXT:    j .LBB48_4
 ; RV32-NEXT:  .LBB48_2:
-; RV32-NEXT:    sltiu a6, a4, -1
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    sltiu a6, a3, -1
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB48_4
 ; RV32-NEXT:  .LBB48_3: # %entry
-; RV32-NEXT:    slti a6, a0, 0
+; RV32-NEXT:    slti a6, a4, 0
 ; RV32-NEXT:  .LBB48_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -3272,21 +3272,21 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB48_6: # %entry
-; RV32-NEXT:    or a4, t0, a4
-; RV32-NEXT:    and a5, a7, a0
+; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    and a4, a7, a4
 ; RV32-NEXT:    and a2, a7, a2
-; RV32-NEXT:    beq a1, a3, .LBB48_8
+; RV32-NEXT:    beq a1, a0, .LBB48_8
 ; RV32-NEXT:  # %bb.7: # %entry
-; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB48_9
 ; RV32-NEXT:  .LBB48_8:
-; RV32-NEXT:    snez a0, a4
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:  .LBB48_9: # %entry
-; RV32-NEXT:    and a2, a2, a5
-; RV32-NEXT:    li a3, -1
-; RV32-NEXT:    beq a2, a3, .LBB48_11
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    li a5, -1
+; RV32-NEXT:    beq a2, a5, .LBB48_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a5, 0
+; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:  .LBB48_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB48_13
@@ -3294,7 +3294,7 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB48_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3371,30 +3371,30 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti@plt
-; RV32-NEXT:    lw a0, 8(sp)
-; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 20(sp)
+; RV32-NEXT:    lw a0, 20(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a2, .LBB50_2
+; RV32-NEXT:    beqz a0, .LBB50_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    slti a4, a0, 0
 ; RV32-NEXT:    j .LBB50_3
 ; RV32-NEXT:  .LBB50_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB50_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a2
+; RV32-NEXT:    or a3, a3, a0
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a0, a2, a0
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    addi a3, a0, -1
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    and a1, a3, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3438,24 +3438,24 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2@plt
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti@plt
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lui a3, 524288
-; RV32-NEXT:    addi a5, a3, -1
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a4, 20(sp)
+; RV32-NEXT:    lui a0, 524288
+; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB51_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    bnez a7, .LBB51_3
 ; RV32-NEXT:    j .LBB51_4
 ; RV32-NEXT:  .LBB51_2:
-; RV32-NEXT:    sltiu a6, a4, -1
-; RV32-NEXT:    or a7, a2, a0
+; RV32-NEXT:    sltiu a6, a3, -1
+; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB51_4
 ; RV32-NEXT:  .LBB51_3: # %entry
-; RV32-NEXT:    slti a6, a0, 0
+; RV32-NEXT:    slti a6, a4, 0
 ; RV32-NEXT:  .LBB51_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -3463,21 +3463,21 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB51_6: # %entry
-; RV32-NEXT:    or a4, t0, a4
-; RV32-NEXT:    and a5, a7, a0
+; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    and a4, a7, a4
 ; RV32-NEXT:    and a2, a7, a2
-; RV32-NEXT:    beq a1, a3, .LBB51_8
+; RV32-NEXT:    beq a1, a0, .LBB51_8
 ; RV32-NEXT:  # %bb.7: # %entry
-; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB51_9
 ; RV32-NEXT:  .LBB51_8:
-; RV32-NEXT:    snez a0, a4
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:  .LBB51_9: # %entry
-; RV32-NEXT:    and a2, a2, a5
-; RV32-NEXT:    li a3, -1
-; RV32-NEXT:    beq a2, a3, .LBB51_11
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    li a5, -1
+; RV32-NEXT:    beq a2, a5, .LBB51_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a5, 0
+; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:  .LBB51_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB51_13
@@ -3485,7 +3485,7 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB51_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3596,30 +3596,30 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2@plt
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti@plt
-; RV32-NEXT:    lw a0, 8(sp)
-; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 20(sp)
+; RV32-NEXT:    lw a0, 20(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a2, .LBB53_2
+; RV32-NEXT:    beqz a0, .LBB53_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    slti a4, a0, 0
 ; RV32-NEXT:    j .LBB53_3
 ; RV32-NEXT:  .LBB53_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB53_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a2
+; RV32-NEXT:    or a3, a3, a0
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a0, a2, a0
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    addi a3, a0, -1
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    and a1, a3, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index cb64e24128b5e3..6e276b7a7a597f 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -302,8 +302,8 @@ define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    bgez a2, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
@@ -330,8 +330,8 @@ define i128 @abs128(i128 %x) {
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a3, 4(a1)
 ; RV32ZBB-NEXT:    lw a4, 0(a1)
+; RV32ZBB-NEXT:    lw a3, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
@@ -384,8 +384,8 @@ define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    bgez a2, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
@@ -412,8 +412,8 @@ define i128 @select_abs128(i128 %x) {
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a3, 4(a1)
 ; RV32ZBB-NEXT:    lw a4, 0(a1)
+; RV32ZBB-NEXT:    lw a3, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
index 15abc9b75883c8..5b296274e552f1 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -9,8 +9,8 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lw a3, 0(a0)
 ; RV32-NEXT:    lw a1, 4(a0)
-; RV32-NEXT:    lw a2, 12(a0)
 ; RV32-NEXT:    lw a4, 8(a0)
+; RV32-NEXT:    lw a2, 12(a0)
 ; RV32-NEXT:    seqz a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    andi a0, a0, 4
@@ -42,8 +42,8 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lw a3, 0(a0)
 ; RV64-NEXT:    lw a1, 8(a0)
-; RV64-NEXT:    lw a2, 24(a0)
 ; RV64-NEXT:    lw a4, 16(a0)
+; RV64-NEXT:    lw a2, 24(a0)
 ; RV64-NEXT:    seqz a0, a3
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    andi a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
index 13d03c5217fb1b..dfd62e8d5f9f56 100644
--- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll
+++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
@@ -56,16 +56,16 @@ entry:
 define void @test3(ptr %a, ptr %b) nounwind {
 ; RV32-LABEL: test3:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 4(a1)
-; RV32-NEXT:    lw a3, 12(a1)
+; RV32-NEXT:    lw a2, 12(a1)
+; RV32-NEXT:    lw a3, 4(a1)
 ; RV32-NEXT:    lw a4, 8(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    lui a5, 524288
-; RV32-NEXT:    xor a3, a3, a5
+; RV32-NEXT:    xor a2, a2, a5
 ; RV32-NEXT:    sw a4, 8(a0)
 ; RV32-NEXT:    sw a1, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    sw a3, 12(a0)
+; RV32-NEXT:    sw a3, 4(a0)
+; RV32-NEXT:    sw a2, 12(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test3:
diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
index bfac15e009f00e..22199eedc231c5 100644
--- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
@@ -222,32 +222,32 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset s1, -24
 ; RV64IFD-NEXT:    .cfi_offset s2, -32
 ; RV64IFD-NEXT:    .cfi_offset fs0, -40
-; RV64IFD-NEXT:    lhu s1, 16(a1)
-; RV64IFD-NEXT:    lhu s2, 0(a1)
-; RV64IFD-NEXT:    lhu a1, 8(a1)
+; RV64IFD-NEXT:    lhu s1, 0(a1)
+; RV64IFD-NEXT:    lhu a2, 8(a1)
+; RV64IFD-NEXT:    lhu s2, 16(a1)
 ; RV64IFD-NEXT:    mv s0, a0
-; RV64IFD-NEXT:    fmv.w.x fa0, a1
+; RV64IFD-NEXT:    fmv.w.x fa0, a2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs0, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s2
+; RV64IFD-NEXT:    fmv.w.x fa0, s1
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fs0
-; RV64IFD-NEXT:    slli s2, a0, 16
+; RV64IFD-NEXT:    slli s1, a0, 16
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fa0
 ; RV64IFD-NEXT:    slli a0, a0, 48
 ; RV64IFD-NEXT:    srli a0, a0, 48
-; RV64IFD-NEXT:    or s2, a0, s2
-; RV64IFD-NEXT:    fmv.w.x fa0, s1
+; RV64IFD-NEXT:    or s1, a0, s1
+; RV64IFD-NEXT:    fmv.w.x fa0, s2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fa0
 ; RV64IFD-NEXT:    sh a0, 4(s0)
-; RV64IFD-NEXT:    sw s2, 0(s0)
+; RV64IFD-NEXT:    sw s1, 0(s0)
 ; RV64IFD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64IFD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64IFD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -349,27 +349,27 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset fs0, -48
 ; RV64IFD-NEXT:    .cfi_offset fs1, -56
 ; RV64IFD-NEXT:    .cfi_offset fs2, -64
-; RV64IFD-NEXT:    lhu s1, 24(a1)
-; RV64IFD-NEXT:    lhu s2, 0(a1)
-; RV64IFD-NEXT:    lhu s3, 8(a1)
-; RV64IFD-NEXT:    lhu a1, 16(a1)
+; RV64IFD-NEXT:    lhu s1, 0(a1)
+; RV64IFD-NEXT:    lhu s2, 8(a1)
+; RV64IFD-NEXT:    lhu a2, 16(a1)
+; RV64IFD-NEXT:    lhu s3, 24(a1)
 ; RV64IFD-NEXT:    mv s0, a0
-; RV64IFD-NEXT:    fmv.w.x fa0, a1
+; RV64IFD-NEXT:    fmv.w.x fa0, a2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs0, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s3
+; RV64IFD-NEXT:    fmv.w.x fa0, s2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs1, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s2
+; RV64IFD-NEXT:    fmv.w.x fa0, s1
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs2, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s1
+; RV64IFD-NEXT:    fmv.w.x fa0, s3
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    fmv.x.w s1, fs2
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 94b9444dfaf8c5..fa80f1f3c48d28 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -738,25 +738,25 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 12(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 4(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw s0, 4(a1)
+; RV32I-NEXT:    lw s1, 8(a1)
+; RV32I-NEXT:    lw s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call frexpf@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    addi a1, sp, 20
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    lw a1, 8(sp)
 ; RV32I-NEXT:    lw a2, 12(sp)
@@ -764,7 +764,7 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32I-NEXT:    lw a4, 20(sp)
 ; RV32I-NEXT:    sw a0, 12(s3)
 ; RV32I-NEXT:    sw s1, 8(s3)
-; RV32I-NEXT:    sw s2, 4(s3)
+; RV32I-NEXT:    sw s0, 4(s3)
 ; RV32I-NEXT:    sw s4, 0(s3)
 ; RV32I-NEXT:    sw a4, 28(s3)
 ; RV32I-NEXT:    sw a3, 24(s3)
@@ -788,25 +788,25 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw s0, 24(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 8(a1)
 ; RV64I-NEXT:    lw a2, 0(a1)
+; RV64I-NEXT:    lw s0, 8(a1)
+; RV64I-NEXT:    lw s1, 16(a1)
+; RV64I-NEXT:    lw s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call frexpf@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    lw a1, 0(sp)
 ; RV64I-NEXT:    lw a2, 4(sp)
@@ -814,7 +814,7 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64I-NEXT:    lw a4, 12(sp)
 ; RV64I-NEXT:    sw a0, 12(s3)
 ; RV64I-NEXT:    sw s1, 8(s3)
-; RV64I-NEXT:    sw s2, 4(s3)
+; RV64I-NEXT:    sw s0, 4(s3)
 ; RV64I-NEXT:    sw s4, 0(s3)
 ; RV64I-NEXT:    sw a4, 28(s3)
 ; RV64I-NEXT:    sw a3, 24(s3)
@@ -1006,29 +1006,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 12(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 4(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw s0, 4(a1)
+; RV32I-NEXT:    lw s1, 8(a1)
+; RV32I-NEXT:    lw s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call frexpf@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    addi a1, sp, 20
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    sw a0, 12(s3)
 ; RV32I-NEXT:    sw s1, 8(s3)
-; RV32I-NEXT:    sw s2, 4(s3)
+; RV32I-NEXT:    sw s0, 4(s3)
 ; RV32I-NEXT:    sw s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
@@ -1048,29 +1048,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw s0, 24(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 8(a1)
 ; RV64I-NEXT:    lw a2, 0(a1)
+; RV64I-NEXT:    lw s0, 8(a1)
+; RV64I-NEXT:    lw s1, 16(a1)
+; RV64I-NEXT:    lw s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call frexpf@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    sw a0, 12(s3)
 ; RV64I-NEXT:    sw s1, 8(s3)
-; RV64I-NEXT:    sw s2, 4(s3)
+; RV64I-NEXT:    sw s0, 4(s3)
 ; RV64I-NEXT:    sw s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -1254,22 +1254,22 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 12(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 4(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw s0, 4(a1)
+; RV32I-NEXT:    lw s1, 8(a1)
+; RV32I-NEXT:    lw s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    addi a1, sp, 12
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    addi a1, sp, 16
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    addi a1, sp, 20
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    addi a1, sp, 24
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    lw a0, 24(sp)
 ; RV32I-NEXT:    lw a1, 20(sp)
@@ -1295,22 +1295,22 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64I-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw s0, 24(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 8(a1)
 ; RV64I-NEXT:    lw a2, 0(a1)
+; RV64I-NEXT:    lw s0, 8(a1)
+; RV64I-NEXT:    lw s1, 16(a1)
+; RV64I-NEXT:    lw s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    addi a1, sp, 16
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    addi a1, sp, 20
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    lw a0, 20(sp)
 ; RV64I-NEXT:    lw a1, 16(sp)
@@ -1584,16 +1584,16 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind {
 ; RV32IFD-NEXT:    addi a2, sp, 36
 ; RV32IFD-NEXT:    sw a3, 0(sp)
 ; RV32IFD-NEXT:    call frexpl@plt
-; RV32IFD-NEXT:    lw a0, 36(sp)
+; RV32IFD-NEXT:    lw a0, 24(sp)
 ; RV32IFD-NEXT:    lw a1, 28(sp)
-; RV32IFD-NEXT:    lw a2, 24(sp)
+; RV32IFD-NEXT:    lw a2, 16(sp)
 ; RV32IFD-NEXT:    lw a3, 20(sp)
-; RV32IFD-NEXT:    lw a4, 16(sp)
+; RV32IFD-NEXT:    lw a4, 36(sp)
 ; RV32IFD-NEXT:    sw a1, 12(s0)
-; RV32IFD-NEXT:    sw a2, 8(s0)
+; RV32IFD-NEXT:    sw a0, 8(s0)
 ; RV32IFD-NEXT:    sw a3, 4(s0)
-; RV32IFD-NEXT:    sw a4, 0(s0)
-; RV32IFD-NEXT:    sw a0, 16(s0)
+; RV32IFD-NEXT:    sw a2, 0(s0)
+; RV32IFD-NEXT:    sw a4, 16(s0)
 ; RV32IFD-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 48
@@ -1637,16 +1637,16 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    addi a2, sp, 36
 ; RV32IZFINXZDINX-NEXT:    sw a3, 0(sp)
 ; RV32IZFINXZDINX-NEXT:    call frexpl@plt
-; RV32IZFINXZDINX-NEXT:    lw a0, 36(sp)
+; RV32IZFINXZDINX-NEXT:    lw a0, 24(sp)
 ; RV32IZFINXZDINX-NEXT:    lw a1, 28(sp)
-; RV32IZFINXZDINX-NEXT:    lw a2, 24(sp)
+; RV32IZFINXZDINX-NEXT:    lw a2, 16(sp)
 ; RV32IZFINXZDINX-NEXT:    lw a3, 20(sp)
-; RV32IZFINXZDINX-NEXT:    lw a4, 16(sp)
+; RV32IZFINXZDINX-NEXT:    lw a4, 36(sp)
 ; RV32IZFINXZDINX-NEXT:    sw a1, 12(s0)
-; RV32IZFINXZDINX-NEXT:    sw a2, 8(s0)
+; RV32IZFINXZDINX-NEXT:    sw a0, 8(s0)
 ; RV32IZFINXZDINX-NEXT:    sw a3, 4(s0)
-; RV32IZFINXZDINX-NEXT:    sw a4, 0(s0)
-; RV32IZFINXZDINX-NEXT:    sw a0, 16(s0)
+; RV32IZFINXZDINX-NEXT:    sw a2, 0(s0)
+; RV32IZFINXZDINX-NEXT:    sw a4, 16(s0)
 ; RV32IZFINXZDINX-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, 48
@@ -1690,16 +1690,16 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind {
 ; RV32I-NEXT:    addi a2, sp, 36
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    call frexpl@plt
-; RV32I-NEXT:    lw a0, 36(sp)
+; RV32I-NEXT:    lw a0, 24(sp)
 ; RV32I-NEXT:    lw a1, 28(sp)
-; RV32I-NEXT:    lw a2, 24(sp)
+; RV32I-NEXT:    lw a2, 16(sp)
 ; RV32I-NEXT:    lw a3, 20(sp)
-; RV32I-NEXT:    lw a4, 16(sp)
+; RV32I-NEXT:    lw a4, 36(sp)
 ; RV32I-NEXT:    sw a1, 12(s0)
-; RV32I-NEXT:    sw a2, 8(s0)
+; RV32I-NEXT:    sw a0, 8(s0)
 ; RV32I-NEXT:    sw a3, 4(s0)
-; RV32I-NEXT:    sw a4, 0(s0)
-; RV32I-NEXT:    sw a0, 16(s0)
+; RV32I-NEXT:    sw a2, 0(s0)
+; RV32I-NEXT:    sw a4, 16(s0)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 48
diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll
index 26ad872509194f..44e9631d3442d6 100644
--- a/llvm/test/CodeGen/RISCV/memcpy.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy.ll
@@ -25,16 +25,16 @@ define i32 @t0() {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    lui a0, %hi(src)
 ; RV32-NEXT:    lw a1, %lo(src)(a0)
-; RV32-NEXT:    lui a2, %hi(dst)
-; RV32-NEXT:    sw a1, %lo(dst)(a2)
 ; RV32-NEXT:    addi a0, a0, %lo(src)
-; RV32-NEXT:    lbu a1, 10(a0)
+; RV32-NEXT:    lw a2, 4(a0)
 ; RV32-NEXT:    lh a3, 8(a0)
-; RV32-NEXT:    lw a0, 4(a0)
-; RV32-NEXT:    addi a2, a2, %lo(dst)
-; RV32-NEXT:    sb a1, 10(a2)
-; RV32-NEXT:    sh a3, 8(a2)
-; RV32-NEXT:    sw a0, 4(a2)
+; RV32-NEXT:    lbu a0, 10(a0)
+; RV32-NEXT:    lui a4, %hi(dst)
+; RV32-NEXT:    sw a1, %lo(dst)(a4)
+; RV32-NEXT:    addi a1, a4, %lo(dst)
+; RV32-NEXT:    sb a0, 10(a1)
+; RV32-NEXT:    sh a3, 8(a1)
+; RV32-NEXT:    sw a2, 4(a1)
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
 ;
@@ -42,14 +42,14 @@ define i32 @t0() {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lui a0, %hi(src)
 ; RV64-NEXT:    ld a1, %lo(src)(a0)
-; RV64-NEXT:    lui a2, %hi(dst)
 ; RV64-NEXT:    addi a0, a0, %lo(src)
-; RV64-NEXT:    lbu a3, 10(a0)
-; RV64-NEXT:    lh a0, 8(a0)
-; RV64-NEXT:    sd a1, %lo(dst)(a2)
-; RV64-NEXT:    addi a1, a2, %lo(dst)
-; RV64-NEXT:    sb a3, 10(a1)
-; RV64-NEXT:    sh a0, 8(a1)
+; RV64-NEXT:    lh a2, 8(a0)
+; RV64-NEXT:    lbu a0, 10(a0)
+; RV64-NEXT:    lui a3, %hi(dst)
+; RV64-NEXT:    sd a1, %lo(dst)(a3)
+; RV64-NEXT:    addi a1, a3, %lo(dst)
+; RV64-NEXT:    sb a0, 10(a1)
+; RV64-NEXT:    sh a2, 8(a1)
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
 ;
@@ -57,14 +57,14 @@ define i32 @t0() {
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lui a0, %hi(src)
 ; RV32-FAST-NEXT:    lw a1, %lo(src)(a0)
-; RV32-FAST-NEXT:    lui a2, %hi(dst)
 ; RV32-FAST-NEXT:    addi a0, a0, %lo(src)
-; RV32-FAST-NEXT:    lw a3, 7(a0)
-; RV32-FAST-NEXT:    lw a0, 4(a0)
-; RV32-FAST-NEXT:    sw a1, %lo(dst)(a2)
-; RV32-FAST-NEXT:    addi a1, a2, %lo(dst)
-; RV32-FAST-NEXT:    sw a3, 7(a1)
-; RV32-FAST-NEXT:    sw a0, 4(a1)
+; RV32-FAST-NEXT:    lw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a0, 7(a0)
+; RV32-FAST-NEXT:    lui a3, %hi(dst)
+; RV32-FAST-NEXT:    sw a1, %lo(dst)(a3)
+; RV32-FAST-NEXT:    addi a1, a3, %lo(dst)
+; RV32-FAST-NEXT:    sw a0, 7(a1)
+; RV32-FAST-NEXT:    sw a2, 4(a1)
 ; RV32-FAST-NEXT:    li a0, 0
 ; RV32-FAST-NEXT:    ret
 ;
@@ -166,16 +166,16 @@ define void @t2(ptr nocapture %C) nounwind {
 ; RV64-FAST-NEXT:    lui a1, %hi(.L.str2)
 ; RV64-FAST-NEXT:    ld a2, %lo(.L.str2)(a1)
 ; RV64-FAST-NEXT:    sd a2, 0(a0)
-; RV64-FAST-NEXT:    lui a2, 1156
-; RV64-FAST-NEXT:    addi a2, a2, 332
 ; RV64-FAST-NEXT:    addi a1, a1, %lo(.L.str2)
-; RV64-FAST-NEXT:    ld a3, 24(a1)
-; RV64-FAST-NEXT:    ld a4, 16(a1)
-; RV64-FAST-NEXT:    ld a1, 8(a1)
-; RV64-FAST-NEXT:    sw a2, 32(a0)
-; RV64-FAST-NEXT:    sd a3, 24(a0)
-; RV64-FAST-NEXT:    sd a4, 16(a0)
-; RV64-FAST-NEXT:    sd a1, 8(a0)
+; RV64-FAST-NEXT:    ld a2, 8(a1)
+; RV64-FAST-NEXT:    ld a3, 16(a1)
+; RV64-FAST-NEXT:    ld a1, 24(a1)
+; RV64-FAST-NEXT:    lui a4, 1156
+; RV64-FAST-NEXT:    addi a4, a4, 332
+; RV64-FAST-NEXT:    sw a4, 32(a0)
+; RV64-FAST-NEXT:    sd a1, 24(a0)
+; RV64-FAST-NEXT:    sd a3, 16(a0)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
index d046aaf98f5edd..d84f24aaac8b8c 100644
--- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
+++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
@@ -1,25 +1,10 @@
 ; REQUIRES: asserts
 ; RUN: llc -mtriple=riscv32 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
-; RUN:   | FileCheck -check-prefix=NOCLUSTER %s
-; RUN: llc -mtriple=riscv64 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
-; RUN:   | FileCheck -check-prefix=NOCLUSTER %s
-; RUN: llc -mtriple=riscv32 -riscv-misched-load-clustering -verify-misched \
-; RUN:     -debug-only=machine-scheduler -o - 2>&1 < %s \
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
-; RUN: llc -mtriple=riscv64 -riscv-misched-load-clustering -verify-misched \
-; RUN:     -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: llc -mtriple=riscv64 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
 
-
 define i32 @load_clustering_1(ptr nocapture %p) {
-; NOCLUSTER: ********** MI Scheduling **********
-; NOCLUSTER-LABEL: load_clustering_1:%bb.0
-; NOCLUSTER: *** Final schedule for %bb.0 ***
-; NOCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
-; NOCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
-; NOCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
-; NOCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
-;
 ; LDCLUSTER: ********** MI Scheduling **********
 ; LDCLUSTER-LABEL: load_clustering_1:%bb.0
 ; LDCLUSTER: *** Final schedule for %bb.0 ***
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index f2b7e8d26328d5..dd6131e064cac2 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1167,48 +1167,48 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 12(a1)
-; RV32IM-NEXT:    lw a3, 8(a1)
-; RV32IM-NEXT:    lw a4, 0(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
+; RV32IM-NEXT:    lw a2, 0(a1)
+; RV32IM-NEXT:    lw a3, 4(a1)
+; RV32IM-NEXT:    lw a4, 8(a1)
+; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -15
 ; RV32IM-NEXT:    slli a5, a5, 8
-; RV32IM-NEXT:    mulhu a6, a4, a5
-; RV32IM-NEXT:    mul a7, a1, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
+; RV32IM-NEXT:    mul a7, a3, a5
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a1, a5
+; RV32IM-NEXT:    mulhu t0, a3, a5
 ; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a4
-; RV32IM-NEXT:    neg t0, a4
+; RV32IM-NEXT:    sub a6, a6, a2
+; RV32IM-NEXT:    neg t0, a2
 ; RV32IM-NEXT:    sltu t1, a6, t0
 ; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a4, t2
+; RV32IM-NEXT:    mulhu t3, a2, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a1
-; RV32IM-NEXT:    mul t5, a3, a5
-; RV32IM-NEXT:    sub t5, t5, a4
+; RV32IM-NEXT:    sub t4, t1, a3
+; RV32IM-NEXT:    mul t5, a4, a5
+; RV32IM-NEXT:    sub t5, t5, a2
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a1
+; RV32IM-NEXT:    neg s1, a3
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a1, t2
+; RV32IM-NEXT:    mulhu t1, a3, t2
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    mul a2, a2, a5
-; RV32IM-NEXT:    mulhu t1, a3, a5
-; RV32IM-NEXT:    sub a3, t1, a3
-; RV32IM-NEXT:    add a2, a3, a2
+; RV32IM-NEXT:    mul a1, a1, a5
+; RV32IM-NEXT:    mulhu t1, a4, a5
+; RV32IM-NEXT:    sub a4, t1, a4
 ; RV32IM-NEXT:    add a1, a4, a1
-; RV32IM-NEXT:    sub a1, t3, a1
-; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    add a3, a2, a3
+; RV32IM-NEXT:    sub a3, t3, a3
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    mul a2, a4, a5
+; RV32IM-NEXT:    mul a2, a2, a5
 ; RV32IM-NEXT:    sw a2, 0(a0)
 ; RV32IM-NEXT:    sw a6, 4(a0)
 ; RV32IM-NEXT:    sw t6, 8(a0)
@@ -1252,39 +1252,39 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32I-LABEL: muli128_m63:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    slli a3, a2, 6
-; RV32I-NEXT:    sltu a5, a2, a3
+; RV32I-NEXT:    lw a5, 12(a1)
+; RV32I-NEXT:    slli a1, a2, 6
+; RV32I-NEXT:    sltu a4, a2, a1
 ; RV32I-NEXT:    srli a7, a2, 26
-; RV32I-NEXT:    slli t0, a1, 6
+; RV32I-NEXT:    slli t0, a3, 6
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    mv t0, a5
-; RV32I-NEXT:    beq a1, a7, .LBB31_2
+; RV32I-NEXT:    mv t0, a4
+; RV32I-NEXT:    beq a3, a7, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a1, a7
+; RV32I-NEXT:    sltu t0, a3, a7
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    srli t1, a1, 26
+; RV32I-NEXT:    srli t1, a3, 26
 ; RV32I-NEXT:    slli t2, a6, 6
 ; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    sub t2, a6, t1
 ; RV32I-NEXT:    sltu t3, t2, t0
 ; RV32I-NEXT:    sltu t1, a6, t1
 ; RV32I-NEXT:    srli a6, a6, 26
-; RV32I-NEXT:    slli t4, a4, 6
+; RV32I-NEXT:    slli t4, a5, 6
 ; RV32I-NEXT:    or a6, t4, a6
-; RV32I-NEXT:    sub a4, a4, a6
-; RV32I-NEXT:    sub a4, a4, t1
-; RV32I-NEXT:    sub a4, a4, t3
+; RV32I-NEXT:    sub a5, a5, a6
+; RV32I-NEXT:    sub a5, a5, t1
+; RV32I-NEXT:    sub a5, a5, t3
 ; RV32I-NEXT:    sub a6, t2, t0
-; RV32I-NEXT:    sub a1, a1, a7
-; RV32I-NEXT:    sub a1, a1, a5
-; RV32I-NEXT:    sub a2, a2, a3
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sub a2, a2, a1
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli128_m63:
@@ -1292,52 +1292,52 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 12(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 4(a1)
-; RV32IM-NEXT:    lw a1, 8(a1)
+; RV32IM-NEXT:    lw a2, 0(a1)
+; RV32IM-NEXT:    lw a3, 4(a1)
+; RV32IM-NEXT:    lw a4, 8(a1)
+; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -63
-; RV32IM-NEXT:    mulhu a6, a3, a5
-; RV32IM-NEXT:    slli a7, a4, 6
-; RV32IM-NEXT:    sub a7, a4, a7
+; RV32IM-NEXT:    mulhu a6, a2, a5
+; RV32IM-NEXT:    slli a7, a3, 6
+; RV32IM-NEXT:    sub a7, a3, a7
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a4, a5
+; RV32IM-NEXT:    mulhu t0, a3, a5
 ; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a3
-; RV32IM-NEXT:    neg t0, a3
+; RV32IM-NEXT:    sub a6, a6, a2
+; RV32IM-NEXT:    neg t0, a2
 ; RV32IM-NEXT:    sltu t1, a6, t0
 ; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a3, t2
+; RV32IM-NEXT:    mulhu t3, a2, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a4
-; RV32IM-NEXT:    slli t5, a1, 6
-; RV32IM-NEXT:    sub t6, a1, a3
+; RV32IM-NEXT:    sub t4, t1, a3
+; RV32IM-NEXT:    slli t5, a4, 6
+; RV32IM-NEXT:    sub t6, a4, a2
 ; RV32IM-NEXT:    sub t5, t6, t5
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a4
+; RV32IM-NEXT:    neg s1, a3
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a4, t2
+; RV32IM-NEXT:    mulhu t1, a3, t2
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    slli t1, a2, 6
-; RV32IM-NEXT:    sub a2, a2, t1
-; RV32IM-NEXT:    mulhu a5, a1, a5
-; RV32IM-NEXT:    sub a5, a5, a1
-; RV32IM-NEXT:    add a2, a5, a2
-; RV32IM-NEXT:    add a4, a3, a4
-; RV32IM-NEXT:    sub a1, t3, a4
-; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    slli t1, a1, 6
+; RV32IM-NEXT:    sub a1, a1, t1
+; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    sub a5, a5, a4
+; RV32IM-NEXT:    add a1, a5, a1
+; RV32IM-NEXT:    add a3, a2, a3
+; RV32IM-NEXT:    sub a3, t3, a3
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    slli a2, a3, 6
-; RV32IM-NEXT:    sub a3, a3, a2
-; RV32IM-NEXT:    sw a3, 0(a0)
+; RV32IM-NEXT:    slli a3, a2, 6
+; RV32IM-NEXT:    sub a2, a2, a3
+; RV32IM-NEXT:    sw a2, 0(a0)
 ; RV32IM-NEXT:    sw a6, 4(a0)
 ; RV32IM-NEXT:    sw t6, 8(a0)
 ; RV32IM-NEXT:    sw a1, 12(a0)
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
index 4c5c36fc72d14d..f2421970340b56 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -907,54 +907,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -968,54 +968,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -1029,44 +1029,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
@@ -1090,44 +1090,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
@@ -1163,112 +1163,112 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -2321,54 +2321,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -2382,54 +2382,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -2443,44 +2443,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
@@ -2504,44 +2504,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
@@ -2577,112 +2577,112 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_P1_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -3735,54 +3735,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -3796,54 +3796,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -3857,44 +3857,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
@@ -3918,44 +3918,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
@@ -3991,112 +3991,112 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_PALL_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -5149,54 +5149,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -5210,54 +5210,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -5271,44 +5271,44 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
@@ -5332,44 +5332,44 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
@@ -5405,112 +5405,112 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_S1_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -6563,54 +6563,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -6624,54 +6624,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -6685,44 +6685,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
@@ -6746,44 +6746,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
@@ -6819,112 +6819,112 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_ALL_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 7c3294fa81dcfe..71b9d9b8c5dca3 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -1241,8 +1241,8 @@ define i64 @foo2(ptr %p) {
 define void @PR41129(ptr %p64) {
 ; RV32-LABEL: PR41129:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 4(a0)
 ; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a2, 4(a0)
 ; RV32-NEXT:    or a3, a1, a2
 ; RV32-NEXT:    beqz a3, .LBB37_2
 ; RV32-NEXT:  # %bb.1: # %false
diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
index 9ff4235746caf0..9bc7bb6170a0fa 100644
--- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll
+++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
@@ -1109,41 +1109,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s4}, -32
 ; RV32IZCMP-NEXT:    lui a0, %hi(var0)
 ; RV32IZCMP-NEXT:    lw a6, %lo(var0)(a0)
-; RV32IZCMP-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV32IZCMP-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV32IZCMP-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV32IZCMP-NEXT:    addi a5, a0, %lo(var0)
-; RV32IZCMP-NEXT:    lw t2, 16(a5)
-; RV32IZCMP-NEXT:    lw t3, 20(a5)
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw a1, 48(a5)
-; RV32IZCMP-NEXT:    lw s0, 52(a5)
-; RV32IZCMP-NEXT:    lw s1, 68(a5)
-; RV32IZCMP-NEXT:    lw a2, 64(a5)
-; RV32IZCMP-NEXT:    lw a3, 60(a5)
-; RV32IZCMP-NEXT:    lw a4, 56(a5)
-; RV32IZCMP-NEXT:    sw s1, 68(a5)
-; RV32IZCMP-NEXT:    sw a2, 64(a5)
-; RV32IZCMP-NEXT:    sw a3, 60(a5)
-; RV32IZCMP-NEXT:    sw a4, 56(a5)
-; RV32IZCMP-NEXT:    sw s0, 52(a5)
-; RV32IZCMP-NEXT:    sw a1, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
-; RV32IZCMP-NEXT:    sw t3, 20(a5)
-; RV32IZCMP-NEXT:    sw t2, 16(a5)
-; RV32IZCMP-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV32IZCMP-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV32IZCMP-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV32IZCMP-NEXT:    addi a2, a0, %lo(var0)
+; RV32IZCMP-NEXT:    lw a7, 16(a2)
+; RV32IZCMP-NEXT:    lw t0, 20(a2)
+; RV32IZCMP-NEXT:    lw t1, 24(a2)
+; RV32IZCMP-NEXT:    lw t2, 28(a2)
+; RV32IZCMP-NEXT:    lw t3, 32(a2)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw a3, 48(a2)
+; RV32IZCMP-NEXT:    lw a4, 52(a2)
+; RV32IZCMP-NEXT:    lw a5, 56(a2)
+; RV32IZCMP-NEXT:    lw a1, 60(a2)
+; RV32IZCMP-NEXT:    lw s0, 64(a2)
+; RV32IZCMP-NEXT:    lw s1, 68(a2)
+; RV32IZCMP-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32IZCMP-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32IZCMP-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32IZCMP-NEXT:    sw s1, 68(a2)
+; RV32IZCMP-NEXT:    sw s0, 64(a2)
+; RV32IZCMP-NEXT:    sw a1, 60(a2)
+; RV32IZCMP-NEXT:    sw a5, 56(a2)
+; RV32IZCMP-NEXT:    sw a4, 52(a2)
+; RV32IZCMP-NEXT:    sw a3, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
+; RV32IZCMP-NEXT:    sw t3, 32(a2)
+; RV32IZCMP-NEXT:    sw t2, 28(a2)
+; RV32IZCMP-NEXT:    sw t1, 24(a2)
+; RV32IZCMP-NEXT:    sw t0, 20(a2)
+; RV32IZCMP-NEXT:    sw a7, 16(a2)
+; RV32IZCMP-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32IZCMP-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32IZCMP-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32IZCMP-NEXT:    sw a6, %lo(var0)(a0)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s4}, 32
 ;
@@ -1152,41 +1152,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s4}, -48
 ; RV64IZCMP-NEXT:    lui a0, %hi(var0)
 ; RV64IZCMP-NEXT:    lw a6, %lo(var0)(a0)
-; RV64IZCMP-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV64IZCMP-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV64IZCMP-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV64IZCMP-NEXT:    addi a5, a0, %lo(var0)
-; RV64IZCMP-NEXT:    lw t2, 16(a5)
-; RV64IZCMP-NEXT:    lw t3, 20(a5)
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw a1, 48(a5)
-; RV64IZCMP-NEXT:    lw s0, 52(a5)
-; RV64IZCMP-NEXT:    lw s1, 68(a5)
-; RV64IZCMP-NEXT:    lw a2, 64(a5)
-; RV64IZCMP-NEXT:    lw a3, 60(a5)
-; RV64IZCMP-NEXT:    lw a4, 56(a5)
-; RV64IZCMP-NEXT:    sw s1, 68(a5)
-; RV64IZCMP-NEXT:    sw a2, 64(a5)
-; RV64IZCMP-NEXT:    sw a3, 60(a5)
-; RV64IZCMP-NEXT:    sw a4, 56(a5)
-; RV64IZCMP-NEXT:    sw s0, 52(a5)
-; RV64IZCMP-NEXT:    sw a1, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
-; RV64IZCMP-NEXT:    sw t3, 20(a5)
-; RV64IZCMP-NEXT:    sw t2, 16(a5)
-; RV64IZCMP-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV64IZCMP-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV64IZCMP-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV64IZCMP-NEXT:    addi a2, a0, %lo(var0)
+; RV64IZCMP-NEXT:    lw a7, 16(a2)
+; RV64IZCMP-NEXT:    lw t0, 20(a2)
+; RV64IZCMP-NEXT:    lw t1, 24(a2)
+; RV64IZCMP-NEXT:    lw t2, 28(a2)
+; RV64IZCMP-NEXT:    lw t3, 32(a2)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw a3, 48(a2)
+; RV64IZCMP-NEXT:    lw a4, 52(a2)
+; RV64IZCMP-NEXT:    lw a5, 56(a2)
+; RV64IZCMP-NEXT:    lw a1, 60(a2)
+; RV64IZCMP-NEXT:    lw s0, 64(a2)
+; RV64IZCMP-NEXT:    lw s1, 68(a2)
+; RV64IZCMP-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64IZCMP-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64IZCMP-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64IZCMP-NEXT:    sw s1, 68(a2)
+; RV64IZCMP-NEXT:    sw s0, 64(a2)
+; RV64IZCMP-NEXT:    sw a1, 60(a2)
+; RV64IZCMP-NEXT:    sw a5, 56(a2)
+; RV64IZCMP-NEXT:    sw a4, 52(a2)
+; RV64IZCMP-NEXT:    sw a3, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
+; RV64IZCMP-NEXT:    sw t3, 32(a2)
+; RV64IZCMP-NEXT:    sw t2, 28(a2)
+; RV64IZCMP-NEXT:    sw t1, 24(a2)
+; RV64IZCMP-NEXT:    sw t0, 20(a2)
+; RV64IZCMP-NEXT:    sw a7, 16(a2)
+; RV64IZCMP-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64IZCMP-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64IZCMP-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64IZCMP-NEXT:    sw a6, %lo(var0)(a0)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s4}, 48
 ;
@@ -1195,41 +1195,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32IZCMP-SR-NEXT:    cm.push {ra, s0-s4}, -32
 ; RV32IZCMP-SR-NEXT:    lui a0, %hi(var0)
 ; RV32IZCMP-SR-NEXT:    lw a6, %lo(var0)(a0)
-; RV32IZCMP-SR-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV32IZCMP-SR-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV32IZCMP-SR-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV32IZCMP-SR-NEXT:    addi a5, a0, %lo(var0)
-; RV32IZCMP-SR-NEXT:    lw t2, 16(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 20(a5)
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 20(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 16(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV32IZCMP-SR-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV32IZCMP-SR-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV32IZCMP-SR-NEXT:    addi a2, a0, %lo(var0)
+; RV32IZCMP-SR-NEXT:    lw a7, 16(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 20(a2)
+; RV32IZCMP-SR-NEXT:    lw t1, 24(a2)
+; RV32IZCMP-SR-NEXT:    lw t2, 28(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, 32(a2)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw a5, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32IZCMP-SR-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32IZCMP-SR-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32IZCMP-SR-NEXT:    sw s1, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw a5, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    sw t3, 32(a2)
+; RV32IZCMP-SR-NEXT:    sw t2, 28(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, 24(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 20(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32IZCMP-SR-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32IZCMP-SR-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32IZCMP-SR-NEXT:    sw a6, %lo(var0)(a0)
 ; RV32IZCMP-SR-NEXT:    cm.popret {ra, s0-s4}, 32
 ;
@@ -1238,41 +1238,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64IZCMP-SR-NEXT:    cm.push {ra, s0-s4}, -48
 ; RV64IZCMP-SR-NEXT:    lui a0, %hi(var0)
 ; RV64IZCMP-SR-NEXT:    lw a6, %lo(var0)(a0)
-; RV64IZCMP-SR-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV64IZCMP-SR-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV64IZCMP-SR-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV64IZCMP-SR-NEXT:    addi a5, a0, %lo(var0)
-; RV64IZCMP-SR-NEXT:    lw t2, 16(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 20(a5)
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 20(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 16(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV64IZCMP-SR-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV64IZCMP-SR-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV64IZCMP-SR-NEXT:    addi a2, a0, %lo(var0)
+; RV64IZCMP-SR-NEXT:    lw a7, 16(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 20(a2)
+; RV64IZCMP-SR-NEXT:    lw t1, 24(a2)
+; RV64IZCMP-SR-NEXT:    lw t2, 28(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, 32(a2)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw a5, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64IZCMP-SR-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64IZCMP-SR-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64IZCMP-SR-NEXT:    sw s1, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw a5, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    sw t3, 32(a2)
+; RV64IZCMP-SR-NEXT:    sw t2, 28(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, 24(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 20(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64IZCMP-SR-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64IZCMP-SR-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64IZCMP-SR-NEXT:    sw a6, %lo(var0)(a0)
 ; RV64IZCMP-SR-NEXT:    cm.popret {ra, s0-s4}, 48
 ;
@@ -1286,41 +1286,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32I-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(var0)
 ; RV32I-NEXT:    lw a1, %lo(var0)(a0)
-; RV32I-NEXT:    lw a2, %lo(var0+4)(a0)
-; RV32I-NEXT:    lw a3, %lo(var0+8)(a0)
-; RV32I-NEXT:    lw a4, %lo(var0+12)(a0)
-; RV32I-NEXT:    addi a5, a0, %lo(var0)
-; RV32I-NEXT:    lw a6, 16(a5)
-; RV32I-NEXT:    lw a7, 20(a5)
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 68(a5)
-; RV32I-NEXT:    lw s2, 64(a5)
-; RV32I-NEXT:    lw s3, 60(a5)
-; RV32I-NEXT:    lw s4, 56(a5)
-; RV32I-NEXT:    sw s1, 68(a5)
-; RV32I-NEXT:    sw s2, 64(a5)
-; RV32I-NEXT:    sw s3, 60(a5)
-; RV32I-NEXT:    sw s4, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
-; RV32I-NEXT:    sw a7, 20(a5)
-; RV32I-NEXT:    sw a6, 16(a5)
-; RV32I-NEXT:    sw a4, %lo(var0+12)(a0)
-; RV32I-NEXT:    sw a3, %lo(var0+8)(a0)
-; RV32I-NEXT:    sw a2, %lo(var0+4)(a0)
+; RV32I-NEXT:    addi a2, a0, %lo(var0)
+; RV32I-NEXT:    lw a3, 16(a2)
+; RV32I-NEXT:    lw a4, 20(a2)
+; RV32I-NEXT:    lw a5, 24(a2)
+; RV32I-NEXT:    lw a6, 28(a2)
+; RV32I-NEXT:    lw a7, 32(a2)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32I-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32I-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
+; RV32I-NEXT:    sw a7, 32(a2)
+; RV32I-NEXT:    sw a6, 28(a2)
+; RV32I-NEXT:    sw a5, 24(a2)
+; RV32I-NEXT:    sw a4, 20(a2)
+; RV32I-NEXT:    sw a3, 16(a2)
+; RV32I-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32I-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32I-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32I-NEXT:    sw a1, %lo(var0)(a0)
 ; RV32I-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
@@ -1340,41 +1340,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64I-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lui a0, %hi(var0)
 ; RV64I-NEXT:    lw a1, %lo(var0)(a0)
-; RV64I-NEXT:    lw a2, %lo(var0+4)(a0)
-; RV64I-NEXT:    lw a3, %lo(var0+8)(a0)
-; RV64I-NEXT:    lw a4, %lo(var0+12)(a0)
-; RV64I-NEXT:    addi a5, a0, %lo(var0)
-; RV64I-NEXT:    lw a6, 16(a5)
-; RV64I-NEXT:    lw a7, 20(a5)
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 68(a5)
-; RV64I-NEXT:    lw s2, 64(a5)
-; RV64I-NEXT:    lw s3, 60(a5)
-; RV64I-NEXT:    lw s4, 56(a5)
-; RV64I-NEXT:    sw s1, 68(a5)
-; RV64I-NEXT:    sw s2, 64(a5)
-; RV64I-NEXT:    sw s3, 60(a5)
-; RV64I-NEXT:    sw s4, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
-; RV64I-NEXT:    sw a7, 20(a5)
-; RV64I-NEXT:    sw a6, 16(a5)
-; RV64I-NEXT:    sw a4, %lo(var0+12)(a0)
-; RV64I-NEXT:    sw a3, %lo(var0+8)(a0)
-; RV64I-NEXT:    sw a2, %lo(var0+4)(a0)
+; RV64I-NEXT:    addi a2, a0, %lo(var0)
+; RV64I-NEXT:    lw a3, 16(a2)
+; RV64I-NEXT:    lw a4, 20(a2)
+; RV64I-NEXT:    lw a5, 24(a2)
+; RV64I-NEXT:    lw a6, 28(a2)
+; RV64I-NEXT:    lw a7, 32(a2)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64I-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64I-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
+; RV64I-NEXT:    sw a7, 32(a2)
+; RV64I-NEXT:    sw a6, 28(a2)
+; RV64I-NEXT:    sw a5, 24(a2)
+; RV64I-NEXT:    sw a4, 20(a2)
+; RV64I-NEXT:    sw a3, 16(a2)
+; RV64I-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64I-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64I-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64I-NEXT:    sw a1, %lo(var0)(a0)
 ; RV64I-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
@@ -1813,84 +1813,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32IZCMP-NEXT:    sw t4, 44(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    sw t5, 40(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    sw t6, 36(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    lw t0, 92(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t1, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t2, 84(sp) # 4-byte Folded Reload
@@ -1929,84 +1929,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64IZCMP-NEXT:    sd t4, 72(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    sd t5, 64(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    sd t6, 56(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    ld t0, 168(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t1, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t2, 152(sp) # 8-byte Folded Reload
@@ -2045,84 +2045,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32IZCMP-SR-NEXT:    sw t4, 44(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    sw t5, 40(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    sw t6, 36(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    lw t0, 92(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t1, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t2, 84(sp) # 4-byte Folded Reload
@@ -2161,84 +2161,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64IZCMP-SR-NEXT:    sd t4, 72(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    sd t5, 64(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    sd t6, 56(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    ld t0, 168(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t1, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t2, 152(sp) # 8-byte Folded Reload
@@ -2289,84 +2289,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32I-NEXT:    sw t4, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t5, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t6, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t1, 132(sp) # 4-byte Folded Reload
@@ -2429,84 +2429,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64I-NEXT:    sd t4, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd t5, 56(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd t6, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 264(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t0, 256(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t1, 248(sp) # 8-byte Folded Reload
@@ -2546,333 +2546,333 @@ define void @callee_no_irq() nounwind{
 ; RV32IZCMP-LABEL: callee_no_irq:
 ; RV32IZCMP:       # %bb.0:
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV64IZCMP-LABEL: callee_no_irq:
 ; RV64IZCMP:       # %bb.0:
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32IZCMP-SR-LABEL: callee_no_irq:
 ; RV32IZCMP-SR:       # %bb.0:
 ; RV32IZCMP-SR-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV64IZCMP-SR-LABEL: callee_no_irq:
 ; RV64IZCMP-SR:       # %bb.0:
 ; RV64IZCMP-SR-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32I-LABEL: callee_no_irq:
@@ -2891,84 +2891,84 @@ define void @callee_no_irq() nounwind{
 ; RV32I-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -3001,84 +3001,84 @@ define void @callee_no_irq() nounwind{
 ; RV64I-NEXT:    sd s9, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll
index b2dea4237f5a5a..1c5b42f038b178 100644
--- a/llvm/test/CodeGen/RISCV/reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll
@@ -8,24 +8,24 @@
 define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_sum_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
+; RV32-NEXT:    lw a1, 0(a0)
 ; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
-; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a0, 12(a0)
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a0, a3, a0
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_sum_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a1, 24(a0)
+; RV64-NEXT:    lw a1, 0(a0)
 ; RV64-NEXT:    lw a2, 8(a0)
-; RV64-NEXT:    lw a3, 0(a0)
-; RV64-NEXT:    lw a0, 16(a0)
-; RV64-NEXT:    add a2, a3, a2
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    addw a0, a2, a0
+; RV64-NEXT:    lw a3, 16(a0)
+; RV64-NEXT:    lw a0, 24(a0)
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a0, a3, a0
+; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
@@ -40,24 +40,24 @@ define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_xor_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
+; RV32-NEXT:    lw a1, 0(a0)
 ; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
-; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    xor a2, a3, a2
-; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a0, 12(a0)
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    xor a0, a3, a0
+; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_xor_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 24(a0)
+; RV64-NEXT:    ld a1, 0(a0)
 ; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
-; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    xor a2, a3, a2
-; RV64-NEXT:    xor a0, a0, a1
-; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ld a3, 16(a0)
+; RV64-NEXT:    ld a0, 24(a0)
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    xor a0, a3, a0
+; RV64-NEXT:    xor a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
@@ -72,24 +72,24 @@ define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 define i32 @reduce_or_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_or_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
+; RV32-NEXT:    lw a1, 0(a0)
 ; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
-; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    or a2, a3, a2
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a0, 12(a0)
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    or a0, a3, a0
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_or_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 24(a0)
+; RV64-NEXT:    ld a1, 0(a0)
 ; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
-; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    or a2, a3, a2
-; RV64-NEXT:    or a0, a0, a1
-; RV64-NEXT:    or a0, a2, a0
+; RV64-NEXT:    ld a3, 16(a0)
+; RV64-NEXT:    ld a0, 24(a0)
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    or a0, a3, a0
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 5f9ca503bcb053..5a91481572552f 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -751,9 +751,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lw a0, 4(a1)
+; RV32I-NEXT:    lw s5, 0(a1)
 ; RV32I-NEXT:    lw s2, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
-; RV32I-NEXT:    lw s6, 0(a1)
+; RV32I-NEXT:    lw s6, 12(a1)
 ; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    addi s3, a2, 1365
@@ -775,9 +775,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    call __mulsi3@plt
 ; RV32I-NEXT:    srli s8, a0, 24
-; RV32I-NEXT:    srli a0, s6, 1
+; RV32I-NEXT:    srli a0, s5, 1
 ; RV32I-NEXT:    and a0, a0, s3
-; RV32I-NEXT:    sub a0, s6, a0
+; RV32I-NEXT:    sub a0, s5, a0
 ; RV32I-NEXT:    and a1, a0, s4
 ; RV32I-NEXT:    srli a0, a0, 2
 ; RV32I-NEXT:    and a0, a0, s4
@@ -789,9 +789,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    call __mulsi3@plt
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    add s8, a0, s8
-; RV32I-NEXT:    srli a0, s5, 1
+; RV32I-NEXT:    srli a0, s6, 1
 ; RV32I-NEXT:    and a0, a0, s3
-; RV32I-NEXT:    sub a0, s5, a0
+; RV32I-NEXT:    sub a0, s6, a0
 ; RV32I-NEXT:    and a1, a0, s4
 ; RV32I-NEXT:    srli a0, a0, 2
 ; RV32I-NEXT:    and a0, a0, s4
@@ -858,21 +858,21 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_ult_two:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a1, 0(a0)
-; RV32I-NEXT:    lw a2, 12(a0)
+; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a0, 4(a0)
-; RV32I-NEXT:    addi a4, a1, -1
-; RV32I-NEXT:    and a4, a1, a4
+; RV32I-NEXT:    lw a4, 12(a0)
+; RV32I-NEXT:    addi a0, a1, -1
+; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    seqz a1, a1
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    seqz a0, a0
 ; RV32I-NEXT:    addi a1, a3, -1
 ; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    seqz a2, a3
+; RV32I-NEXT:    sub a2, a4, a2
+; RV32I-NEXT:    and a2, a4, a2
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    ret
@@ -901,21 +901,21 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_ugt_one:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a1, 0(a0)
-; RV32I-NEXT:    lw a2, 12(a0)
+; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a0, 4(a0)
-; RV32I-NEXT:    addi a4, a1, -1
-; RV32I-NEXT:    and a4, a1, a4
+; RV32I-NEXT:    lw a4, 12(a0)
+; RV32I-NEXT:    addi a0, a1, -1
+; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    seqz a1, a1
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    addi a1, a3, -1
 ; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    seqz a2, a3
+; RV32I-NEXT:    sub a2, a4, a2
+; RV32I-NEXT:    and a2, a4, a2
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    snez a1, a1
 ; RV32I-NEXT:    ret
@@ -946,15 +946,15 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_eq_one:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a1, a0
-; RV32I-NEXT:    lw a2, 12(a0)
-; RV32I-NEXT:    lw a0, 4(a0)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    beqz a0, .LBB22_3
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a2, 12(a1)
+; RV32I-NEXT:    beqz a3, .LBB22_3
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a0, a3
-; RV32I-NEXT:    xor a0, a0, a3
-; RV32I-NEXT:    sltu a0, a3, a0
+; RV32I-NEXT:    seqz a0, a0
+; RV32I-NEXT:    sub a0, a3, a0
+; RV32I-NEXT:    xor a3, a3, a0
+; RV32I-NEXT:    sltu a0, a0, a3
 ; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    bnez a2, .LBB22_4
 ; RV32I-NEXT:  .LBB22_2:
@@ -963,9 +963,9 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    sltu a1, a2, a1
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB22_3:
-; RV32I-NEXT:    addi a0, a3, -1
-; RV32I-NEXT:    xor a3, a3, a0
-; RV32I-NEXT:    sltu a0, a0, a3
+; RV32I-NEXT:    addi a3, a0, -1
+; RV32I-NEXT:    xor a0, a0, a3
+; RV32I-NEXT:    sltu a0, a3, a0
 ; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    beqz a2, .LBB22_2
 ; RV32I-NEXT:  .LBB22_4:
@@ -1000,20 +1000,20 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_ne_one:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    lw a2, 0(a0)
+; RV32I-NEXT:    lw a3, 4(a0)
 ; RV32I-NEXT:    lw a1, 12(a0)
-; RV32I-NEXT:    lw a2, 4(a0)
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    beqz a2, .LBB23_2
+; RV32I-NEXT:    beqz a3, .LBB23_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    xor a2, a2, a3
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    j .LBB23_3
-; RV32I-NEXT:  .LBB23_2:
-; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    seqz a2, a2
+; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    xor a3, a3, a2
 ; RV32I-NEXT:    sltu a2, a2, a3
+; RV32I-NEXT:    j .LBB23_3
+; RV32I-NEXT:  .LBB23_2:
+; RV32I-NEXT:    addi a3, a2, -1
+; RV32I-NEXT:    xor a2, a2, a3
+; RV32I-NEXT:    sltu a2, a3, a2
 ; RV32I-NEXT:  .LBB23_3:
 ; RV32I-NEXT:    lw a3, 8(a0)
 ; RV32I-NEXT:    xori a0, a2, 1
diff --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
index f38aa71fb158d0..6c4466796aeedd 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
@@ -177,12 +177,12 @@ define i8 @test13(ptr %0, i64 %1) {
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    subw a2, a2, a1
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    lbu a2, 0(a2)
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    subw a3, a3, a1
 ; RV64I-NEXT:    add a0, a0, a3
+; RV64I-NEXT:    lbu a1, 0(a2)
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    ret
   %3 = mul i64 %1, -4294967296
   %4 = add i64 %3, 4294967296 ; 1 << 32
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index d34c10798f4821..92b88054a1d3bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -8,14 +8,14 @@ declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-LABEL: vpreduce_add_v4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a4, 4(a1)
-; RV32-NEXT:    lw a5, 12(a1)
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a5, 4(a1)
 ; RV32-NEXT:    lw a6, 8(a1)
-; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a1, 12(a1)
 ; RV32-NEXT:    lw a7, 0(a2)
-; RV32-NEXT:    lw t0, 8(a2)
-; RV32-NEXT:    lw t1, 12(a2)
-; RV32-NEXT:    lw a2, 4(a2)
+; RV32-NEXT:    lw t0, 4(a2)
+; RV32-NEXT:    lw t1, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
 ; RV32-NEXT:    snez t2, a3
 ; RV32-NEXT:    sltiu t3, a3, 3
 ; RV32-NEXT:    xori t3, t3, 1
@@ -23,34 +23,34 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-NEXT:    xori t4, t4, 1
 ; RV32-NEXT:    sltiu a3, a3, 2
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    and a3, t4, t1
-; RV32-NEXT:    and t0, t3, t0
+; RV32-NEXT:    and a3, a3, t0
+; RV32-NEXT:    and a2, t4, a2
+; RV32-NEXT:    and t0, t3, t1
 ; RV32-NEXT:    and a7, t2, a7
 ; RV32-NEXT:    neg a7, a7
-; RV32-NEXT:    and a1, a7, a1
+; RV32-NEXT:    and a4, a7, a4
 ; RV32-NEXT:    neg a7, t0
 ; RV32-NEXT:    and a6, a7, a6
-; RV32-NEXT:    neg a3, a3
-; RV32-NEXT:    and a3, a3, a5
 ; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    and a2, a2, a4
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a1, a1, a6
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a2, a3
+; RV32-NEXT:    and a2, a2, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a1, a4, a1
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpreduce_add_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a4, 8(a1)
-; RV64-NEXT:    lw a5, 24(a1)
+; RV64-NEXT:    lw a4, 0(a1)
+; RV64-NEXT:    lw a5, 8(a1)
 ; RV64-NEXT:    lw a6, 16(a1)
-; RV64-NEXT:    lw a1, 0(a1)
+; RV64-NEXT:    lw a1, 24(a1)
 ; RV64-NEXT:    ld a7, 0(a2)
-; RV64-NEXT:    ld t0, 16(a2)
-; RV64-NEXT:    ld t1, 24(a2)
-; RV64-NEXT:    ld a2, 8(a2)
+; RV64-NEXT:    ld t0, 8(a2)
+; RV64-NEXT:    ld t1, 16(a2)
+; RV64-NEXT:    ld a2, 24(a2)
 ; RV64-NEXT:    sext.w a3, a3
 ; RV64-NEXT:    snez t2, a3
 ; RV64-NEXT:    sltiu t3, a3, 3
@@ -59,21 +59,21 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT:    xori t4, t4, 1
 ; RV64-NEXT:    sltiu a3, a3, 2
 ; RV64-NEXT:    xori a3, a3, 1
-; RV64-NEXT:    and a2, a3, a2
-; RV64-NEXT:    and a3, t4, t1
-; RV64-NEXT:    and t0, t3, t0
+; RV64-NEXT:    and a3, a3, t0
+; RV64-NEXT:    and a2, t4, a2
+; RV64-NEXT:    and t0, t3, t1
 ; RV64-NEXT:    and a7, t2, a7
 ; RV64-NEXT:    negw a7, a7
-; RV64-NEXT:    and a1, a7, a1
+; RV64-NEXT:    and a4, a7, a4
 ; RV64-NEXT:    negw a7, t0
 ; RV64-NEXT:    and a6, a7, a6
-; RV64-NEXT:    negw a3, a3
-; RV64-NEXT:    and a3, a3, a5
 ; RV64-NEXT:    negw a2, a2
-; RV64-NEXT:    and a2, a2, a4
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a1, a1, a6
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    negw a2, a3
+; RV64-NEXT:    and a2, a2, a5
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a1, a4, a1
 ; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index 8ed19ddb1af5cf..81e20a29881630 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -26,38 +26,38 @@ define void @add_v4i32(ptr %x, ptr %y) {
 define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-LABEL: add_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a2, 8(a0)
-; RV32-NEXT:    lw a3, 12(a0)
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a3, 4(a1)
 ; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    lw a6, 4(a1)
-; RV32-NEXT:    lw a7, 0(a1)
+; RV32-NEXT:    lw a6, 8(a0)
+; RV32-NEXT:    lw a7, 12(a0)
 ; RV32-NEXT:    lw t0, 8(a1)
 ; RV32-NEXT:    lw a1, 12(a1)
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a7, a4, a7
-; RV32-NEXT:    sltu a4, a7, a4
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    add t0, a2, t0
-; RV32-NEXT:    sltu a2, t0, a2
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    sltu a4, a2, a4
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a7, a1
+; RV32-NEXT:    add t0, a6, t0
+; RV32-NEXT:    sltu a4, t0, a6
+; RV32-NEXT:    add a1, a1, a4
 ; RV32-NEXT:    sw t0, 8(a0)
-; RV32-NEXT:    sw a7, 0(a0)
+; RV32-NEXT:    sw a2, 0(a0)
 ; RV32-NEXT:    sw a1, 12(a0)
-; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a3, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a3, 8(a0)
 ; RV64-NEXT:    ld a4, 0(a1)
 ; RV64-NEXT:    ld a1, 8(a1)
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    sd a1, 8(a0)
-; RV64-NEXT:    sd a3, 0(a0)
+; RV64-NEXT:    sd a2, 0(a0)
 ; RV64-NEXT:    ret
   %a = load <2 x i64>, ptr %x
   %b = load <2 x i64>, ptr %y
@@ -134,14 +134,14 @@ define void @fadd_v4f32(ptr %x, ptr %y) {
 define void @fadd_v2f64(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fld fa5, 8(a0)
-; CHECK-NEXT:    fld fa4, 0(a0)
+; CHECK-NEXT:    fld fa5, 0(a0)
+; CHECK-NEXT:    fld fa4, 8(a0)
 ; CHECK-NEXT:    fld fa3, 0(a1)
 ; CHECK-NEXT:    fld fa2, 8(a1)
-; CHECK-NEXT:    fadd.d fa4, fa4, fa3
-; CHECK-NEXT:    fadd.d fa5, fa5, fa2
-; CHECK-NEXT:    fsd fa5, 8(a0)
-; CHECK-NEXT:    fsd fa4, 0(a0)
+; CHECK-NEXT:    fadd.d fa5, fa5, fa3
+; CHECK-NEXT:    fadd.d fa4, fa4, fa2
+; CHECK-NEXT:    fsd fa4, 8(a0)
+; CHECK-NEXT:    fsd fa5, 0(a0)
 ; CHECK-NEXT:    ret
   %a = load <2 x double>, ptr %x
   %b = load <2 x double>, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index ed0b15c6add5cd..3c0aaaa94fb55c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -362,36 +362,36 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    fld fa3, 32(sp)
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
+; RV32-NEXT:    fld fa2, 40(sp)
+; RV32-NEXT:    fld fa1, 48(sp)
+; RV32-NEXT:    fld fa0, 56(sp)
+; RV32-NEXT:    feq.d a3, fa3, fa3
 ; RV32-NEXT:    fmax.d fa3, fa3, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    fld fa3, 40(sp)
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
+; RV32-NEXT:    fcvt.w.d a4, fa3, rtz
+; RV32-NEXT:    feq.d a5, fa2, fa2
+; RV32-NEXT:    fmax.d fa3, fa2, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    fld fa3, 48(sp)
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
+; RV32-NEXT:    fcvt.w.d a6, fa3, rtz
+; RV32-NEXT:    feq.d a7, fa1, fa1
+; RV32-NEXT:    fmax.d fa3, fa1, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    fld fa3, 56(sp)
+; RV32-NEXT:    fcvt.w.d t0, fa3, rtz
 ; RV32-NEXT:    neg a0, a0
 ; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    vslide1down.vx v8, v10, a0
+; RV32-NEXT:    neg a0, a3
+; RV32-NEXT:    and a0, a0, a4
 ; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
+; RV32-NEXT:    neg a0, a5
+; RV32-NEXT:    and a0, a0, a6
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    neg a0, a7
+; RV32-NEXT:    and a0, a0, t0
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    feq.d a0, fa0, fa0
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    fmax.d fa5, fa3, fa5
+; RV32-NEXT:    fmax.d fa5, fa0, fa5
 ; RV32-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32-NEXT:    fcvt.w.d a2, fa5, rtz
 ; RV32-NEXT:    and a0, a0, a2
@@ -461,36 +461,36 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    fld fa3, 32(sp)
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
-; RV64-NEXT:    feq.d a0, fa3, fa3
+; RV64-NEXT:    fld fa2, 40(sp)
+; RV64-NEXT:    fld fa1, 48(sp)
+; RV64-NEXT:    fld fa0, 56(sp)
+; RV64-NEXT:    feq.d a3, fa3, fa3
 ; RV64-NEXT:    fmax.d fa3, fa3, fa5
 ; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
-; RV64-NEXT:    fld fa3, 40(sp)
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    feq.d a0, fa3, fa3
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
+; RV64-NEXT:    fcvt.l.d a4, fa3, rtz
+; RV64-NEXT:    feq.d a5, fa2, fa2
+; RV64-NEXT:    fmax.d fa3, fa2, fa5
 ; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
-; RV64-NEXT:    fld fa3, 48(sp)
-; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    feq.d a0, fa3, fa3
-; RV64-NEXT:    fmax.d fa3, fa3, fa5
+; RV64-NEXT:    fcvt.l.d a6, fa3, rtz
+; RV64-NEXT:    feq.d a7, fa1, fa1
+; RV64-NEXT:    fmax.d fa3, fa1, fa5
 ; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
-; RV64-NEXT:    fld fa3, 56(sp)
+; RV64-NEXT:    fcvt.l.d t0, fa3, rtz
 ; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    neg a0, a3
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    feq.d a0, fa3, fa3
+; RV64-NEXT:    neg a0, a5
+; RV64-NEXT:    and a0, a0, a6
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    neg a0, a7
+; RV64-NEXT:    and a0, a0, t0
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    feq.d a0, fa0, fa0
 ; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    fmax.d fa5, fa3, fa5
+; RV64-NEXT:    fmax.d fa5, fa0, fa5
 ; RV64-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64-NEXT:    fcvt.l.d a2, fa5, rtz
 ; RV64-NEXT:    and a0, a0, a2
@@ -553,27 +553,27 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa4, v8
 ; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    fld fa2, 32(sp)
 ; RV32-NEXT:    fmin.d fa4, fa4, fa5
 ; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT:    fld fa4, 40(sp)
-; RV32-NEXT:    fmax.d fa2, fa2, fa3
-; RV32-NEXT:    fmin.d fa2, fa2, fa5
-; RV32-NEXT:    fcvt.wu.d a2, fa2, rtz
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    fld fa4, 32(sp)
+; RV32-NEXT:    fld fa2, 40(sp)
+; RV32-NEXT:    fld fa1, 48(sp)
+; RV32-NEXT:    fld fa0, 56(sp)
 ; RV32-NEXT:    fmax.d fa4, fa4, fa3
-; RV32-NEXT:    fld fa2, 48(sp)
 ; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a3, fa4, rtz
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
+; RV32-NEXT:    fcvt.wu.d a2, fa4, rtz
 ; RV32-NEXT:    fmax.d fa4, fa2, fa3
 ; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT:    fld fa4, 56(sp)
+; RV32-NEXT:    fcvt.wu.d a3, fa4, rtz
+; RV32-NEXT:    fmax.d fa4, fa1, fa3
+; RV32-NEXT:    fmin.d fa4, fa4, fa5
+; RV32-NEXT:    fcvt.wu.d a4, fa4, rtz
+; RV32-NEXT:    vslide1down.vx v8, v10, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    fmax.d fa4, fa4, fa3
+; RV32-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-NEXT:    fmax.d fa4, fa0, fa3
 ; RV32-NEXT:    fmin.d fa5, fa4, fa5
 ; RV32-NEXT:    fcvt.wu.d a0, fa5, rtz
 ; RV32-NEXT:    vslide1down.vx v8, v8, a0
@@ -627,27 +627,27 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa4, v8
 ; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    fld fa2, 32(sp)
 ; RV64-NEXT:    fmin.d fa4, fa4, fa5
 ; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT:    fld fa4, 40(sp)
-; RV64-NEXT:    fmax.d fa2, fa2, fa3
-; RV64-NEXT:    fmin.d fa2, fa2, fa5
-; RV64-NEXT:    fcvt.lu.d a2, fa2, rtz
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    fld fa4, 32(sp)
+; RV64-NEXT:    fld fa2, 40(sp)
+; RV64-NEXT:    fld fa1, 48(sp)
+; RV64-NEXT:    fld fa0, 56(sp)
 ; RV64-NEXT:    fmax.d fa4, fa4, fa3
-; RV64-NEXT:    fld fa2, 48(sp)
 ; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a3, fa4, rtz
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    fcvt.lu.d a2, fa4, rtz
 ; RV64-NEXT:    fmax.d fa4, fa2, fa3
 ; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT:    fld fa4, 56(sp)
+; RV64-NEXT:    fcvt.lu.d a3, fa4, rtz
+; RV64-NEXT:    fmax.d fa4, fa1, fa3
+; RV64-NEXT:    fmin.d fa4, fa4, fa5
+; RV64-NEXT:    fcvt.lu.d a4, fa4, rtz
+; RV64-NEXT:    vslide1down.vx v8, v10, a0
 ; RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64-NEXT:    vslide1down.vx v8, v8, a3
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    fmax.d fa4, fa4, fa3
+; RV64-NEXT:    vslide1down.vx v8, v8, a4
+; RV64-NEXT:    fmax.d fa4, fa0, fa3
 ; RV64-NEXT:    fmin.d fa5, fa4, fa5
 ; RV64-NEXT:    fcvt.lu.d a0, fa5, rtz
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
index ec11ada12eaa76..924d9a84ce9278 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
@@ -134,12 +134,12 @@ define <3 x float> @si2fp_v3i1_v3f32(<3 x i1> %x) {
 define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV32:       # %bb.0:
-; LMULMAX8RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32-NEXT:    vadd.vv v8, v8, v8
@@ -151,12 +151,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV64:       # %bb.0:
-; LMULMAX8RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64-NEXT:    vadd.vv v8, v8, v8
@@ -168,12 +168,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV32-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX1RV32:       # %bb.0:
-; LMULMAX1RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX1RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX1RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV32-NEXT:    vadd.vv v8, v8, v8
@@ -185,12 +185,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV64-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX1RV64:       # %bb.0:
-; LMULMAX1RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX1RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX1RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV64-NEXT:    vadd.vv v8, v8, v8
@@ -202,12 +202,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV32ZVFHMIN-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV32ZVFHMIN:       # %bb.0:
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vadd.vv v8, v8, v8
@@ -219,12 +219,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64ZVFHMIN-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV64ZVFHMIN:       # %bb.0:
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vadd.vv v8, v8, v8
@@ -241,12 +241,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV32:       # %bb.0:
-; LMULMAX8RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32-NEXT:    li a0, 127
@@ -258,12 +258,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV64:       # %bb.0:
-; LMULMAX8RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64-NEXT:    li a0, 127
@@ -275,12 +275,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV32-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX1RV32:       # %bb.0:
-; LMULMAX1RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX1RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX1RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV32-NEXT:    li a0, 127
@@ -292,12 +292,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV64-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX1RV64:       # %bb.0:
-; LMULMAX1RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX1RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX1RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV64-NEXT:    li a0, 127
@@ -309,12 +309,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV32ZVFHMIN-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV32ZVFHMIN:       # %bb.0:
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32ZVFHMIN-NEXT:    li a0, 127
@@ -326,12 +326,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64ZVFHMIN-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV64ZVFHMIN:       # %bb.0:
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64ZVFHMIN-NEXT:    li a0, 127
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index 8acc70faaa1fc9..bad3836c506035 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,25 +7,25 @@
 define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-LABEL: load_large_vector:
 ; ZVE32X:       # %bb.0:
-; ZVE32X-NEXT:    ld a1, 80(a0)
-; ZVE32X-NEXT:    ld a2, 72(a0)
-; ZVE32X-NEXT:    ld a3, 56(a0)
+; ZVE32X-NEXT:    ld a1, 0(a0)
+; ZVE32X-NEXT:    ld a2, 8(a0)
+; ZVE32X-NEXT:    ld a3, 24(a0)
 ; ZVE32X-NEXT:    ld a4, 32(a0)
-; ZVE32X-NEXT:    ld a5, 24(a0)
-; ZVE32X-NEXT:    ld a6, 48(a0)
-; ZVE32X-NEXT:    ld a7, 8(a0)
-; ZVE32X-NEXT:    ld a0, 0(a0)
-; ZVE32X-NEXT:    xor a4, a5, a4
-; ZVE32X-NEXT:    snez a4, a4
+; ZVE32X-NEXT:    ld a5, 48(a0)
+; ZVE32X-NEXT:    ld a6, 56(a0)
+; ZVE32X-NEXT:    ld a7, 72(a0)
+; ZVE32X-NEXT:    ld a0, 80(a0)
+; ZVE32X-NEXT:    xor a3, a3, a4
+; ZVE32X-NEXT:    snez a3, a3
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmv.s.x v8, a4
+; ZVE32X-NEXT:    vmv.s.x v8, a3
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
 ; ZVE32X-NEXT:    vmv.s.x v8, zero
 ; ZVE32X-NEXT:    vmerge.vim v9, v8, 1, v0
-; ZVE32X-NEXT:    xor a0, a0, a7
-; ZVE32X-NEXT:    snez a0, a0
-; ZVE32X-NEXT:    vmv.s.x v10, a0
+; ZVE32X-NEXT:    xor a1, a1, a2
+; ZVE32X-NEXT:    snez a1, a1
+; ZVE32X-NEXT:    vmv.s.x v10, a1
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
@@ -36,9 +36,9 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a0, a6, a3
-; ZVE32X-NEXT:    snez a0, a0
-; ZVE32X-NEXT:    vmv.s.x v11, a0
+; ZVE32X-NEXT:    xor a1, a5, a6
+; ZVE32X-NEXT:    snez a1, a1
+; ZVE32X-NEXT:    vmv.s.x v11, a1
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v11, v11, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
@@ -48,8 +48,8 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v9, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a1, a2, a1
-; ZVE32X-NEXT:    snez a0, a1
+; ZVE32X-NEXT:    xor a0, a7, a0
+; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v10, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index 224f5066138cde..39dda1fd5ebefa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -574,20 +574,20 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    fld fa4, 32(sp)
-; RV32-NEXT:    fld fa3, 40(sp)
 ; RV32-NEXT:    fcvt.w.d a0, fa5
-; RV32-NEXT:    fld fa5, 48(sp)
-; RV32-NEXT:    fcvt.w.d a1, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    fld fa5, 32(sp)
+; RV32-NEXT:    fld fa4, 40(sp)
+; RV32-NEXT:    fld fa3, 48(sp)
+; RV32-NEXT:    fld fa2, 56(sp)
+; RV32-NEXT:    fcvt.w.d a1, fa5
+; RV32-NEXT:    fcvt.w.d a2, fa4
+; RV32-NEXT:    fcvt.w.d a3, fa3
 ; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    fcvt.w.d a0, fa5
-; RV32-NEXT:    fld fa5, 56(sp)
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    fcvt.w.d a0, fa2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    addi sp, s0, -128
 ; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
@@ -627,20 +627,20 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    fld fa4, 32(sp)
-; RV64-i32-NEXT:    fld fa3, 40(sp)
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa5
-; RV64-i32-NEXT:    fld fa5, 48(sp)
-; RV64-i32-NEXT:    fcvt.l.d a1, fa4
-; RV64-i32-NEXT:    fcvt.l.d a2, fa3
+; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-i32-NEXT:    fld fa5, 32(sp)
+; RV64-i32-NEXT:    fld fa4, 40(sp)
+; RV64-i32-NEXT:    fld fa3, 48(sp)
+; RV64-i32-NEXT:    fld fa2, 56(sp)
+; RV64-i32-NEXT:    fcvt.l.d a1, fa5
+; RV64-i32-NEXT:    fcvt.l.d a2, fa4
+; RV64-i32-NEXT:    fcvt.l.d a3, fa3
 ; RV64-i32-NEXT:    vslide1down.vx v8, v10, a0
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
-; RV64-i32-NEXT:    fld fa5, 56(sp)
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64-i32-NEXT:    fcvt.l.d a0, fa2
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    addi sp, s0, -128
 ; RV64-i32-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index ac3bf0d89b5ed9..6e8470af27a862 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -3599,9 +3599,9 @@ define <1 x i64> @mgather_v1i64(<1 x ptr> %ptrs, <1 x i1> %m, <1 x i64> %passthr
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB42_2
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a0)
-; RV32ZVE32F-NEXT:    lw a0, 0(a0)
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV32ZVE32F-NEXT:    lw a0, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB42_2: # %else
 ; RV32ZVE32F-NEXT:    ret
 ;
@@ -3645,30 +3645,30 @@ define <2 x i64> @mgather_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %passthr
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, a4, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB43_4
 ; RV32ZVE32F-NEXT:  .LBB43_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a1, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a1, 12(a1)
 ; RV32ZVE32F-NEXT:    j .LBB43_5
 ; RV32ZVE32F-NEXT:  .LBB43_3:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, a4, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB43_2
 ; RV32ZVE32F-NEXT:  .LBB43_4: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a4, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw a4, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB43_5: # %else2
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a1, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i64:
@@ -3718,60 +3718,60 @@ define <4 x i64> @mgather_v4i64(<4 x ptr> %ptrs, <4 x i1> %m, <4 x i64> %passthr
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB44_6
 ; RV32ZVE32F-NEXT:  .LBB44_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a5, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a5, 12(a1)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    bnez a7, .LBB44_7
 ; RV32ZVE32F-NEXT:  .LBB44_3:
-; RV32ZVE32F-NEXT:    lw a7, 20(a1)
-; RV32ZVE32F-NEXT:    lw t0, 16(a1)
+; RV32ZVE32F-NEXT:    lw a7, 16(a1)
+; RV32ZVE32F-NEXT:    lw t0, 20(a1)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB44_8
 ; RV32ZVE32F-NEXT:  .LBB44_4:
-; RV32ZVE32F-NEXT:    lw a6, 28(a1)
-; RV32ZVE32F-NEXT:    lw a1, 24(a1)
+; RV32ZVE32F-NEXT:    lw a6, 24(a1)
+; RV32ZVE32F-NEXT:    lw a1, 28(a1)
 ; RV32ZVE32F-NEXT:    j .LBB44_9
 ; RV32ZVE32F-NEXT:  .LBB44_5:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB44_2
 ; RV32ZVE32F-NEXT:  .LBB44_6: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    beqz a7, .LBB44_3
 ; RV32ZVE32F-NEXT:  .LBB44_7: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v9
-; RV32ZVE32F-NEXT:    lw a7, 4(t0)
-; RV32ZVE32F-NEXT:    lw t0, 0(t0)
+; RV32ZVE32F-NEXT:    lw a7, 0(t0)
+; RV32ZVE32F-NEXT:    lw t0, 4(t0)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB44_4
 ; RV32ZVE32F-NEXT:  .LBB44_8: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a6, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw a6, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB44_9: # %else8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw t0, 16(a0)
-; RV32ZVE32F-NEXT:    sw a7, 20(a0)
-; RV32ZVE32F-NEXT:    sw a1, 24(a0)
-; RV32ZVE32F-NEXT:    sw a6, 28(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a7, 16(a0)
+; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a6, 24(a0)
+; RV32ZVE32F-NEXT:    sw a1, 28(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v4i64:
@@ -3846,60 +3846,60 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) {
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB45_6
 ; RV32ZVE32F-NEXT:  .LBB45_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a5, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a5, 12(a1)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    bnez a7, .LBB45_7
 ; RV32ZVE32F-NEXT:  .LBB45_3:
-; RV32ZVE32F-NEXT:    lw a7, 20(a1)
-; RV32ZVE32F-NEXT:    lw t0, 16(a1)
+; RV32ZVE32F-NEXT:    lw a7, 16(a1)
+; RV32ZVE32F-NEXT:    lw t0, 20(a1)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB45_8
 ; RV32ZVE32F-NEXT:  .LBB45_4:
-; RV32ZVE32F-NEXT:    lw a6, 28(a1)
-; RV32ZVE32F-NEXT:    lw a1, 24(a1)
+; RV32ZVE32F-NEXT:    lw a6, 24(a1)
+; RV32ZVE32F-NEXT:    lw a1, 28(a1)
 ; RV32ZVE32F-NEXT:    j .LBB45_9
 ; RV32ZVE32F-NEXT:  .LBB45_5:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB45_2
 ; RV32ZVE32F-NEXT:  .LBB45_6: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    beqz a7, .LBB45_3
 ; RV32ZVE32F-NEXT:  .LBB45_7: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v9
-; RV32ZVE32F-NEXT:    lw a7, 4(t0)
-; RV32ZVE32F-NEXT:    lw t0, 0(t0)
+; RV32ZVE32F-NEXT:    lw a7, 0(t0)
+; RV32ZVE32F-NEXT:    lw t0, 4(t0)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB45_4
 ; RV32ZVE32F-NEXT:  .LBB45_8: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a6, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw a6, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB45_9: # %else8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw t0, 16(a0)
-; RV32ZVE32F-NEXT:    sw a7, 20(a0)
-; RV32ZVE32F-NEXT:    sw a1, 24(a0)
-; RV32ZVE32F-NEXT:    sw a6, 28(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a7, 16(a0)
+; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a6, 24(a0)
+; RV32ZVE32F-NEXT:    sw a1, 28(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_truemask_v4i64:
@@ -3966,22 +3966,22 @@ define <4 x i64> @mgather_falsemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru)
 ;
 ; RV32ZVE32F-LABEL: mgather_falsemask_v4i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a2, 0(a1)
-; RV32ZVE32F-NEXT:    lw a3, 4(a1)
-; RV32ZVE32F-NEXT:    lw a4, 8(a1)
-; RV32ZVE32F-NEXT:    lw a5, 12(a1)
-; RV32ZVE32F-NEXT:    lw a6, 28(a1)
-; RV32ZVE32F-NEXT:    lw a7, 24(a1)
-; RV32ZVE32F-NEXT:    lw t0, 20(a1)
+; RV32ZVE32F-NEXT:    lw a2, 20(a1)
+; RV32ZVE32F-NEXT:    lw a3, 24(a1)
+; RV32ZVE32F-NEXT:    lw a4, 28(a1)
+; RV32ZVE32F-NEXT:    lw a5, 0(a1)
+; RV32ZVE32F-NEXT:    lw a6, 4(a1)
+; RV32ZVE32F-NEXT:    lw a7, 8(a1)
+; RV32ZVE32F-NEXT:    lw t0, 12(a1)
 ; RV32ZVE32F-NEXT:    lw a1, 16(a1)
-; RV32ZVE32F-NEXT:    sw a6, 28(a0)
-; RV32ZVE32F-NEXT:    sw a7, 24(a0)
-; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a4, 28(a0)
+; RV32ZVE32F-NEXT:    sw a3, 24(a0)
+; RV32ZVE32F-NEXT:    sw a2, 20(a0)
 ; RV32ZVE32F-NEXT:    sw a1, 16(a0)
-; RV32ZVE32F-NEXT:    sw a5, 12(a0)
-; RV32ZVE32F-NEXT:    sw a4, 8(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t0, 12(a0)
+; RV32ZVE32F-NEXT:    sw a7, 8(a0)
+; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a5, 0(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_falsemask_v4i64:
@@ -4025,77 +4025,77 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB47_8
 ; RV32ZVE32F-NEXT:  .LBB47_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a5, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a5, 12(a1)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB47_9
 ; RV32ZVE32F-NEXT:  .LBB47_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a1)
-; RV32ZVE32F-NEXT:    lw a7, 16(a1)
+; RV32ZVE32F-NEXT:    lw a6, 16(a1)
+; RV32ZVE32F-NEXT:    lw a7, 20(a1)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB47_10
 ; RV32ZVE32F-NEXT:  .LBB47_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a1)
-; RV32ZVE32F-NEXT:    lw t2, 24(a1)
+; RV32ZVE32F-NEXT:    lw t1, 24(a1)
+; RV32ZVE32F-NEXT:    lw t2, 28(a1)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB47_11
 ; RV32ZVE32F-NEXT:  .LBB47_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a1)
-; RV32ZVE32F-NEXT:    lw t4, 32(a1)
+; RV32ZVE32F-NEXT:    lw t3, 32(a1)
+; RV32ZVE32F-NEXT:    lw t4, 36(a1)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB47_12
 ; RV32ZVE32F-NEXT:  .LBB47_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a1)
-; RV32ZVE32F-NEXT:    lw t6, 40(a1)
+; RV32ZVE32F-NEXT:    lw t5, 40(a1)
+; RV32ZVE32F-NEXT:    lw t6, 44(a1)
 ; RV32ZVE32F-NEXT:    j .LBB47_13
 ; RV32ZVE32F-NEXT:  .LBB47_7:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB47_2
 ; RV32ZVE32F-NEXT:  .LBB47_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB47_3
 ; RV32ZVE32F-NEXT:  .LBB47_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB47_4
 ; RV32ZVE32F-NEXT:  .LBB47_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB47_5
 ; RV32ZVE32F-NEXT:  .LBB47_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB47_6
 ; RV32ZVE32F-NEXT:  .LBB47_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB47_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4109,42 +4109,42 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB47_17
 ; RV32ZVE32F-NEXT:  .LBB47_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a1)
-; RV32ZVE32F-NEXT:    lw a1, 56(a1)
+; RV32ZVE32F-NEXT:    lw t0, 56(a1)
+; RV32ZVE32F-NEXT:    lw a1, 60(a1)
 ; RV32ZVE32F-NEXT:    j .LBB47_18
 ; RV32ZVE32F-NEXT:  .LBB47_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a1)
-; RV32ZVE32F-NEXT:    lw s1, 48(a1)
+; RV32ZVE32F-NEXT:    lw s0, 48(a1)
+; RV32ZVE32F-NEXT:    lw s1, 52(a1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB47_15
 ; RV32ZVE32F-NEXT:  .LBB47_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw t0, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB47_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a1, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a1, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -4272,77 +4272,77 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB48_8
 ; RV32ZVE32F-NEXT:  .LBB48_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB48_9
 ; RV32ZVE32F-NEXT:  .LBB48_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB48_10
 ; RV32ZVE32F-NEXT:  .LBB48_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB48_11
 ; RV32ZVE32F-NEXT:  .LBB48_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB48_12
 ; RV32ZVE32F-NEXT:  .LBB48_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB48_13
 ; RV32ZVE32F-NEXT:  .LBB48_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB48_2
 ; RV32ZVE32F-NEXT:  .LBB48_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB48_3
 ; RV32ZVE32F-NEXT:  .LBB48_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB48_4
 ; RV32ZVE32F-NEXT:  .LBB48_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB48_5
 ; RV32ZVE32F-NEXT:  .LBB48_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB48_6
 ; RV32ZVE32F-NEXT:  .LBB48_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB48_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4356,42 +4356,42 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB48_17
 ; RV32ZVE32F-NEXT:  .LBB48_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB48_18
 ; RV32ZVE32F-NEXT:  .LBB48_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB48_15
 ; RV32ZVE32F-NEXT:  .LBB48_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB48_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -4546,77 +4546,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB49_8
 ; RV32ZVE32F-NEXT:  .LBB49_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB49_9
 ; RV32ZVE32F-NEXT:  .LBB49_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB49_10
 ; RV32ZVE32F-NEXT:  .LBB49_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB49_11
 ; RV32ZVE32F-NEXT:  .LBB49_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB49_12
 ; RV32ZVE32F-NEXT:  .LBB49_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB49_13
 ; RV32ZVE32F-NEXT:  .LBB49_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB49_2
 ; RV32ZVE32F-NEXT:  .LBB49_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB49_3
 ; RV32ZVE32F-NEXT:  .LBB49_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB49_4
 ; RV32ZVE32F-NEXT:  .LBB49_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB49_5
 ; RV32ZVE32F-NEXT:  .LBB49_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB49_6
 ; RV32ZVE32F-NEXT:  .LBB49_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB49_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4630,42 +4630,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB49_17
 ; RV32ZVE32F-NEXT:  .LBB49_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB49_18
 ; RV32ZVE32F-NEXT:  .LBB49_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB49_15
 ; RV32ZVE32F-NEXT:  .LBB49_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB49_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -4822,77 +4822,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB50_8
 ; RV32ZVE32F-NEXT:  .LBB50_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB50_9
 ; RV32ZVE32F-NEXT:  .LBB50_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB50_10
 ; RV32ZVE32F-NEXT:  .LBB50_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB50_11
 ; RV32ZVE32F-NEXT:  .LBB50_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB50_12
 ; RV32ZVE32F-NEXT:  .LBB50_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB50_13
 ; RV32ZVE32F-NEXT:  .LBB50_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB50_2
 ; RV32ZVE32F-NEXT:  .LBB50_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB50_3
 ; RV32ZVE32F-NEXT:  .LBB50_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB50_4
 ; RV32ZVE32F-NEXT:  .LBB50_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB50_5
 ; RV32ZVE32F-NEXT:  .LBB50_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB50_6
 ; RV32ZVE32F-NEXT:  .LBB50_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB50_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4906,42 +4906,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB50_17
 ; RV32ZVE32F-NEXT:  .LBB50_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB50_18
 ; RV32ZVE32F-NEXT:  .LBB50_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB50_15
 ; RV32ZVE32F-NEXT:  .LBB50_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB50_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5105,77 +5105,77 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB51_8
 ; RV32ZVE32F-NEXT:  .LBB51_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB51_9
 ; RV32ZVE32F-NEXT:  .LBB51_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB51_10
 ; RV32ZVE32F-NEXT:  .LBB51_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB51_11
 ; RV32ZVE32F-NEXT:  .LBB51_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB51_12
 ; RV32ZVE32F-NEXT:  .LBB51_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB51_13
 ; RV32ZVE32F-NEXT:  .LBB51_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB51_2
 ; RV32ZVE32F-NEXT:  .LBB51_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB51_3
 ; RV32ZVE32F-NEXT:  .LBB51_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB51_4
 ; RV32ZVE32F-NEXT:  .LBB51_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB51_5
 ; RV32ZVE32F-NEXT:  .LBB51_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB51_6
 ; RV32ZVE32F-NEXT:  .LBB51_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB51_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -5189,42 +5189,42 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB51_17
 ; RV32ZVE32F-NEXT:  .LBB51_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB51_18
 ; RV32ZVE32F-NEXT:  .LBB51_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB51_15
 ; RV32ZVE32F-NEXT:  .LBB51_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB51_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5380,77 +5380,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB52_8
 ; RV32ZVE32F-NEXT:  .LBB52_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB52_9
 ; RV32ZVE32F-NEXT:  .LBB52_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB52_10
 ; RV32ZVE32F-NEXT:  .LBB52_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB52_11
 ; RV32ZVE32F-NEXT:  .LBB52_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB52_12
 ; RV32ZVE32F-NEXT:  .LBB52_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB52_13
 ; RV32ZVE32F-NEXT:  .LBB52_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB52_2
 ; RV32ZVE32F-NEXT:  .LBB52_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB52_3
 ; RV32ZVE32F-NEXT:  .LBB52_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB52_4
 ; RV32ZVE32F-NEXT:  .LBB52_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB52_5
 ; RV32ZVE32F-NEXT:  .LBB52_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB52_6
 ; RV32ZVE32F-NEXT:  .LBB52_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB52_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -5464,42 +5464,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB52_17
 ; RV32ZVE32F-NEXT:  .LBB52_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB52_18
 ; RV32ZVE32F-NEXT:  .LBB52_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB52_15
 ; RV32ZVE32F-NEXT:  .LBB52_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB52_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5657,77 +5657,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB53_8
 ; RV32ZVE32F-NEXT:  .LBB53_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB53_9
 ; RV32ZVE32F-NEXT:  .LBB53_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB53_10
 ; RV32ZVE32F-NEXT:  .LBB53_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB53_11
 ; RV32ZVE32F-NEXT:  .LBB53_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB53_12
 ; RV32ZVE32F-NEXT:  .LBB53_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB53_13
 ; RV32ZVE32F-NEXT:  .LBB53_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB53_2
 ; RV32ZVE32F-NEXT:  .LBB53_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB53_3
 ; RV32ZVE32F-NEXT:  .LBB53_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB53_4
 ; RV32ZVE32F-NEXT:  .LBB53_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB53_5
 ; RV32ZVE32F-NEXT:  .LBB53_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB53_6
 ; RV32ZVE32F-NEXT:  .LBB53_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB53_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -5741,42 +5741,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB53_17
 ; RV32ZVE32F-NEXT:  .LBB53_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB53_18
 ; RV32ZVE32F-NEXT:  .LBB53_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB53_15
 ; RV32ZVE32F-NEXT:  .LBB53_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB53_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5941,77 +5941,77 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB54_8
 ; RV32ZVE32F-NEXT:  .LBB54_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB54_9
 ; RV32ZVE32F-NEXT:  .LBB54_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB54_10
 ; RV32ZVE32F-NEXT:  .LBB54_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB54_11
 ; RV32ZVE32F-NEXT:  .LBB54_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB54_12
 ; RV32ZVE32F-NEXT:  .LBB54_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB54_13
 ; RV32ZVE32F-NEXT:  .LBB54_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB54_2
 ; RV32ZVE32F-NEXT:  .LBB54_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB54_3
 ; RV32ZVE32F-NEXT:  .LBB54_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB54_4
 ; RV32ZVE32F-NEXT:  .LBB54_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB54_5
 ; RV32ZVE32F-NEXT:  .LBB54_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB54_6
 ; RV32ZVE32F-NEXT:  .LBB54_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB54_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6025,42 +6025,42 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB54_17
 ; RV32ZVE32F-NEXT:  .LBB54_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB54_18
 ; RV32ZVE32F-NEXT:  .LBB54_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB54_15
 ; RV32ZVE32F-NEXT:  .LBB54_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB54_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -6214,77 +6214,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB55_8
 ; RV32ZVE32F-NEXT:  .LBB55_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB55_9
 ; RV32ZVE32F-NEXT:  .LBB55_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB55_10
 ; RV32ZVE32F-NEXT:  .LBB55_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB55_11
 ; RV32ZVE32F-NEXT:  .LBB55_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB55_12
 ; RV32ZVE32F-NEXT:  .LBB55_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB55_13
 ; RV32ZVE32F-NEXT:  .LBB55_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB55_2
 ; RV32ZVE32F-NEXT:  .LBB55_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB55_3
 ; RV32ZVE32F-NEXT:  .LBB55_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB55_4
 ; RV32ZVE32F-NEXT:  .LBB55_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB55_5
 ; RV32ZVE32F-NEXT:  .LBB55_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB55_6
 ; RV32ZVE32F-NEXT:  .LBB55_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB55_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6298,42 +6298,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB55_17
 ; RV32ZVE32F-NEXT:  .LBB55_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB55_18
 ; RV32ZVE32F-NEXT:  .LBB55_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB55_15
 ; RV32ZVE32F-NEXT:  .LBB55_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB55_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -6488,77 +6488,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB56_8
 ; RV32ZVE32F-NEXT:  .LBB56_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB56_9
 ; RV32ZVE32F-NEXT:  .LBB56_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB56_10
 ; RV32ZVE32F-NEXT:  .LBB56_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB56_11
 ; RV32ZVE32F-NEXT:  .LBB56_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB56_12
 ; RV32ZVE32F-NEXT:  .LBB56_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB56_13
 ; RV32ZVE32F-NEXT:  .LBB56_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB56_2
 ; RV32ZVE32F-NEXT:  .LBB56_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB56_3
 ; RV32ZVE32F-NEXT:  .LBB56_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB56_4
 ; RV32ZVE32F-NEXT:  .LBB56_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB56_5
 ; RV32ZVE32F-NEXT:  .LBB56_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB56_6
 ; RV32ZVE32F-NEXT:  .LBB56_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB56_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6572,42 +6572,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB56_17
 ; RV32ZVE32F-NEXT:  .LBB56_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB56_18
 ; RV32ZVE32F-NEXT:  .LBB56_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB56_15
 ; RV32ZVE32F-NEXT:  .LBB56_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB56_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -6760,22 +6760,22 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ;
 ; RV32ZVE32F-LABEL: mgather_baseidx_v8i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a4, 56(a2)
-; RV32ZVE32F-NEXT:    lw a5, 48(a2)
-; RV32ZVE32F-NEXT:    lw a6, 40(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 24(a2)
 ; RV32ZVE32F-NEXT:    lw a7, 32(a2)
-; RV32ZVE32F-NEXT:    lw t0, 24(a2)
-; RV32ZVE32F-NEXT:    lw t1, 16(a2)
-; RV32ZVE32F-NEXT:    lw t2, 8(a2)
+; RV32ZVE32F-NEXT:    lw t0, 40(a2)
+; RV32ZVE32F-NEXT:    lw t1, 48(a2)
+; RV32ZVE32F-NEXT:    lw t2, 56(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a2), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
@@ -6785,77 +6785,77 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB57_8
 ; RV32ZVE32F-NEXT:  .LBB57_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a3)
-; RV32ZVE32F-NEXT:    lw a5, 8(a3)
+; RV32ZVE32F-NEXT:    lw a4, 8(a3)
+; RV32ZVE32F-NEXT:    lw a5, 12(a3)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB57_9
 ; RV32ZVE32F-NEXT:  .LBB57_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a3)
-; RV32ZVE32F-NEXT:    lw a7, 16(a3)
+; RV32ZVE32F-NEXT:    lw a6, 16(a3)
+; RV32ZVE32F-NEXT:    lw a7, 20(a3)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB57_10
 ; RV32ZVE32F-NEXT:  .LBB57_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a3)
-; RV32ZVE32F-NEXT:    lw t2, 24(a3)
+; RV32ZVE32F-NEXT:    lw t1, 24(a3)
+; RV32ZVE32F-NEXT:    lw t2, 28(a3)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB57_11
 ; RV32ZVE32F-NEXT:  .LBB57_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a3)
-; RV32ZVE32F-NEXT:    lw t4, 32(a3)
+; RV32ZVE32F-NEXT:    lw t3, 32(a3)
+; RV32ZVE32F-NEXT:    lw t4, 36(a3)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB57_12
 ; RV32ZVE32F-NEXT:  .LBB57_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a3)
-; RV32ZVE32F-NEXT:    lw t6, 40(a3)
+; RV32ZVE32F-NEXT:    lw t5, 40(a3)
+; RV32ZVE32F-NEXT:    lw t6, 44(a3)
 ; RV32ZVE32F-NEXT:    j .LBB57_13
 ; RV32ZVE32F-NEXT:  .LBB57_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB57_2
 ; RV32ZVE32F-NEXT:  .LBB57_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB57_3
 ; RV32ZVE32F-NEXT:  .LBB57_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB57_4
 ; RV32ZVE32F-NEXT:  .LBB57_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB57_5
 ; RV32ZVE32F-NEXT:  .LBB57_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB57_6
 ; RV32ZVE32F-NEXT:  .LBB57_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB57_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6869,42 +6869,42 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB57_17
 ; RV32ZVE32F-NEXT:  .LBB57_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a3)
-; RV32ZVE32F-NEXT:    lw a3, 56(a3)
+; RV32ZVE32F-NEXT:    lw t0, 56(a3)
+; RV32ZVE32F-NEXT:    lw a3, 60(a3)
 ; RV32ZVE32F-NEXT:    j .LBB57_18
 ; RV32ZVE32F-NEXT:  .LBB57_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a3)
-; RV32ZVE32F-NEXT:    lw s1, 48(a3)
+; RV32ZVE32F-NEXT:    lw s0, 48(a3)
+; RV32ZVE32F-NEXT:    lw s1, 52(a3)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB57_15
 ; RV32ZVE32F-NEXT:  .LBB57_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw t0, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:  .LBB57_18: # %else20
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a3, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -11928,22 +11928,22 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1>
 ;
 ; RV32ZVE32F-LABEL: mgather_baseidx_v8f64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a3, 56(a2)
-; RV32ZVE32F-NEXT:    lw a4, 48(a2)
-; RV32ZVE32F-NEXT:    lw a5, 40(a2)
+; RV32ZVE32F-NEXT:    lw a3, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 16(a2)
+; RV32ZVE32F-NEXT:    lw a5, 24(a2)
 ; RV32ZVE32F-NEXT:    lw a6, 32(a2)
-; RV32ZVE32F-NEXT:    lw a7, 24(a2)
-; RV32ZVE32F-NEXT:    lw t0, 16(a2)
-; RV32ZVE32F-NEXT:    lw t1, 8(a2)
+; RV32ZVE32F-NEXT:    lw a7, 40(a2)
+; RV32ZVE32F-NEXT:    lw t0, 48(a2)
+; RV32ZVE32F-NEXT:    lw t1, 56(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a2), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 0125c0256162c9..92c8eee728308e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -275,9 +275,9 @@ define void @mscatter_v4i8(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i8:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -335,9 +335,9 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i8:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -407,13 +407,13 @@ define void @mscatter_v8i8(<8 x i8> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8i8:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -822,9 +822,9 @@ define void @mscatter_v4i16(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -882,9 +882,9 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -954,13 +954,13 @@ define void @mscatter_v8i16(<8 x i16> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8i16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -1727,9 +1727,9 @@ define void @mscatter_v4i32(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -1787,9 +1787,9 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -1859,13 +1859,13 @@ define void @mscatter_v8i32(<8 x i32> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8i32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -2947,8 +2947,8 @@ define void @mscatter_v2i64(<2 x i64> %val, <2 x ptr> %ptrs, <2 x i1> %m) {
 ;
 ; RV32ZVE32F-LABEL: mscatter_v2i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a2, 12(a0)
 ; RV32ZVE32F-NEXT:    lw a1, 8(a0)
+; RV32ZVE32F-NEXT:    lw a2, 12(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV32ZVE32F-NEXT:    andi a4, a3, 1
@@ -3014,12 +3014,12 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV32ZVE32F-LABEL: mscatter_v4i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a1, 28(a0)
-; RV32ZVE32F-NEXT:    lw a2, 24(a0)
-; RV32ZVE32F-NEXT:    lw a3, 20(a0)
-; RV32ZVE32F-NEXT:    lw a4, 16(a0)
-; RV32ZVE32F-NEXT:    lw a7, 12(a0)
 ; RV32ZVE32F-NEXT:    lw a6, 8(a0)
+; RV32ZVE32F-NEXT:    lw a7, 12(a0)
+; RV32ZVE32F-NEXT:    lw a3, 16(a0)
+; RV32ZVE32F-NEXT:    lw a4, 20(a0)
+; RV32ZVE32F-NEXT:    lw a1, 24(a0)
+; RV32ZVE32F-NEXT:    lw a2, 28(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v0
 ; RV32ZVE32F-NEXT:    andi t0, a5, 1
@@ -3056,38 +3056,38 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a5, a5, 8
 ; RV32ZVE32F-NEXT:    beqz a5, .LBB38_4
 ; RV32ZVE32F-NEXT:  .LBB38_8: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 24(a1)
+; RV64ZVE32F-NEXT:    ld a6, 8(a1)
 ; RV64ZVE32F-NEXT:    ld a4, 16(a1)
-; RV64ZVE32F-NEXT:    ld a7, 8(a1)
-; RV64ZVE32F-NEXT:    ld a3, 24(a0)
-; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a2, 24(a1)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a3, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a6, v0
-; RV64ZVE32F-NEXT:    andi t1, a6, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a7, v0
+; RV64ZVE32F-NEXT:    andi t1, a7, 1
 ; RV64ZVE32F-NEXT:    bnez t1, .LBB38_5
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB38_6
 ; RV64ZVE32F-NEXT:  .LBB38_2: # %else2
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB38_7
 ; RV64ZVE32F-NEXT:  .LBB38_3: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB38_8
 ; RV64ZVE32F-NEXT:  .LBB38_4: # %else6
 ; RV64ZVE32F-NEXT:    ret
@@ -3095,15 +3095,15 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ; RV64ZVE32F-NEXT:    ld a1, 0(a1)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a0, 0(a1)
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB38_2
 ; RV64ZVE32F-NEXT:  .LBB38_6: # %cond.store1
-; RV64ZVE32F-NEXT:    sd t0, 0(a7)
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    sd t0, 0(a6)
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB38_3
 ; RV64ZVE32F-NEXT:  .LBB38_7: # %cond.store3
 ; RV64ZVE32F-NEXT:    sd a5, 0(a4)
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB38_4
 ; RV64ZVE32F-NEXT:  .LBB38_8: # %cond.store5
 ; RV64ZVE32F-NEXT:    sd a3, 0(a2)
@@ -3127,12 +3127,12 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ;
 ; RV32ZVE32F-LABEL: mscatter_truemask_v4i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a1, 28(a0)
-; RV32ZVE32F-NEXT:    lw a2, 24(a0)
-; RV32ZVE32F-NEXT:    lw a3, 20(a0)
-; RV32ZVE32F-NEXT:    lw a4, 16(a0)
-; RV32ZVE32F-NEXT:    lw a7, 12(a0)
 ; RV32ZVE32F-NEXT:    lw a6, 8(a0)
+; RV32ZVE32F-NEXT:    lw a7, 12(a0)
+; RV32ZVE32F-NEXT:    lw a3, 16(a0)
+; RV32ZVE32F-NEXT:    lw a4, 20(a0)
+; RV32ZVE32F-NEXT:    lw a1, 24(a0)
+; RV32ZVE32F-NEXT:    lw a2, 28(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmset.m v9
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
@@ -3169,38 +3169,38 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a5, a5, 8
 ; RV32ZVE32F-NEXT:    beqz a5, .LBB39_4
 ; RV32ZVE32F-NEXT:  .LBB39_8: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 24(a1)
+; RV64ZVE32F-NEXT:    ld a6, 8(a1)
 ; RV64ZVE32F-NEXT:    ld a4, 16(a1)
-; RV64ZVE32F-NEXT:    ld a7, 8(a1)
-; RV64ZVE32F-NEXT:    ld a3, 24(a0)
-; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a2, 24(a1)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a3, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v8
-; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
+; RV64ZVE32F-NEXT:    vmv.x.s a7, v8
 ; RV64ZVE32F-NEXT:    beqz zero, .LBB39_5
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_6
 ; RV64ZVE32F-NEXT:  .LBB39_2: # %else2
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_7
 ; RV64ZVE32F-NEXT:  .LBB39_3: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_8
 ; RV64ZVE32F-NEXT:  .LBB39_4: # %else6
 ; RV64ZVE32F-NEXT:    ret
@@ -3208,15 +3208,15 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a1, 0(a1)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a0, 0(a1)
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_2
 ; RV64ZVE32F-NEXT:  .LBB39_6: # %cond.store1
-; RV64ZVE32F-NEXT:    sd t0, 0(a7)
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    sd t0, 0(a6)
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_3
 ; RV64ZVE32F-NEXT:  .LBB39_7: # %cond.store3
 ; RV64ZVE32F-NEXT:    sd a5, 0(a4)
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_4
 ; RV64ZVE32F-NEXT:  .LBB39_8: # %cond.store5
 ; RV64ZVE32F-NEXT:    sd a3, 0(a2)
@@ -3260,51 +3260,51 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a1, 60(a0)
-; RV32ZVE32F-NEXT:    lw a2, 56(a0)
-; RV32ZVE32F-NEXT:    lw a3, 52(a0)
-; RV32ZVE32F-NEXT:    lw a4, 48(a0)
-; RV32ZVE32F-NEXT:    lw a5, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a3, 48(a0)
+; RV32ZVE32F-NEXT:    lw a4, 52(a0)
+; RV32ZVE32F-NEXT:    lw a1, 56(a0)
+; RV32ZVE32F-NEXT:    lw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a6, v0
-; RV32ZVE32F-NEXT:    andi s1, a6, 1
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v0
+; RV32ZVE32F-NEXT:    andi s1, a5, 1
 ; RV32ZVE32F-NEXT:    bnez s1, .LBB41_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
-; RV32ZVE32F-NEXT:    andi a0, a6, 2
+; RV32ZVE32F-NEXT:    andi a0, a5, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_11
 ; RV32ZVE32F-NEXT:  .LBB41_2: # %else2
-; RV32ZVE32F-NEXT:    andi a0, a6, 4
+; RV32ZVE32F-NEXT:    andi a0, a5, 4
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_12
 ; RV32ZVE32F-NEXT:  .LBB41_3: # %else4
-; RV32ZVE32F-NEXT:    andi a0, a6, 8
+; RV32ZVE32F-NEXT:    andi a0, a5, 8
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_13
 ; RV32ZVE32F-NEXT:  .LBB41_4: # %else6
-; RV32ZVE32F-NEXT:    andi a0, a6, 16
+; RV32ZVE32F-NEXT:    andi a0, a5, 16
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_14
 ; RV32ZVE32F-NEXT:  .LBB41_5: # %else8
-; RV32ZVE32F-NEXT:    andi a0, a6, 32
+; RV32ZVE32F-NEXT:    andi a0, a5, 32
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_15
 ; RV32ZVE32F-NEXT:  .LBB41_6: # %else10
-; RV32ZVE32F-NEXT:    andi a0, a6, 64
+; RV32ZVE32F-NEXT:    andi a0, a5, 64
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_16
 ; RV32ZVE32F-NEXT:  .LBB41_7: # %else12
-; RV32ZVE32F-NEXT:    andi a0, a6, -128
+; RV32ZVE32F-NEXT:    andi a0, a5, -128
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_9
 ; RV32ZVE32F-NEXT:  .LBB41_8: # %cond.store13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB41_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -3318,7 +3318,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV32ZVE32F-NEXT:    vmv.x.s s2, v8
 ; RV32ZVE32F-NEXT:    sw s1, 4(s2)
 ; RV32ZVE32F-NEXT:    sw a0, 0(s2)
-; RV32ZVE32F-NEXT:    andi a0, a6, 2
+; RV32ZVE32F-NEXT:    andi a0, a5, 2
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_2
 ; RV32ZVE32F-NEXT:  .LBB41_11: # %cond.store1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
@@ -3326,47 +3326,47 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw s0, 4(a0)
 ; RV32ZVE32F-NEXT:    sw t6, 0(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 4
+; RV32ZVE32F-NEXT:    andi a0, a5, 4
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_3
 ; RV32ZVE32F-NEXT:  .LBB41_12: # %cond.store3
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 8
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_4
 ; RV32ZVE32F-NEXT:  .LBB41_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 16
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_5
 ; RV32ZVE32F-NEXT:  .LBB41_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 32
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_6
 ; RV32ZVE32F-NEXT:  .LBB41_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a5, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 64
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_7
 ; RV32ZVE32F-NEXT:  .LBB41_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, -128
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_8
 ; RV32ZVE32F-NEXT:    j .LBB41_9
 ;
@@ -3380,47 +3380,47 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV64ZVE32F-NEXT:    .cfi_offset s0, -8
 ; RV64ZVE32F-NEXT:    .cfi_offset s1, -16
 ; RV64ZVE32F-NEXT:    .cfi_offset s2, -24
+; RV64ZVE32F-NEXT:    ld t5, 8(a1)
+; RV64ZVE32F-NEXT:    ld t3, 16(a1)
+; RV64ZVE32F-NEXT:    ld t1, 24(a1)
+; RV64ZVE32F-NEXT:    ld a6, 32(a1)
+; RV64ZVE32F-NEXT:    ld a4, 40(a1)
+; RV64ZVE32F-NEXT:    ld a3, 48(a1)
 ; RV64ZVE32F-NEXT:    ld a2, 56(a1)
-; RV64ZVE32F-NEXT:    ld a4, 48(a1)
-; RV64ZVE32F-NEXT:    ld a6, 40(a1)
-; RV64ZVE32F-NEXT:    ld t1, 32(a1)
-; RV64ZVE32F-NEXT:    ld t3, 24(a1)
-; RV64ZVE32F-NEXT:    ld t5, 16(a1)
-; RV64ZVE32F-NEXT:    ld s0, 8(a1)
-; RV64ZVE32F-NEXT:    ld a3, 56(a0)
-; RV64ZVE32F-NEXT:    ld a5, 48(a0)
-; RV64ZVE32F-NEXT:    ld t0, 40(a0)
-; RV64ZVE32F-NEXT:    ld t2, 32(a0)
-; RV64ZVE32F-NEXT:    ld t4, 24(a0)
-; RV64ZVE32F-NEXT:    ld t6, 16(a0)
 ; RV64ZVE32F-NEXT:    ld s1, 8(a0)
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a7, v0
-; RV64ZVE32F-NEXT:    andi s2, a7, 1
+; RV64ZVE32F-NEXT:    ld s0, 16(a0)
+; RV64ZVE32F-NEXT:    ld t6, 24(a0)
+; RV64ZVE32F-NEXT:    ld t4, 32(a0)
+; RV64ZVE32F-NEXT:    ld t2, 40(a0)
+; RV64ZVE32F-NEXT:    ld a7, 48(a0)
+; RV64ZVE32F-NEXT:    ld a5, 56(a0)
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vmv.x.s t0, v0
+; RV64ZVE32F-NEXT:    andi s2, t0, 1
 ; RV64ZVE32F-NEXT:    bnez s2, .LBB41_10
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a0, a7, 2
+; RV64ZVE32F-NEXT:    andi a0, t0, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_11
 ; RV64ZVE32F-NEXT:  .LBB41_2: # %else2
-; RV64ZVE32F-NEXT:    andi a0, a7, 4
+; RV64ZVE32F-NEXT:    andi a0, t0, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_12
 ; RV64ZVE32F-NEXT:  .LBB41_3: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a7, 8
+; RV64ZVE32F-NEXT:    andi a0, t0, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_13
 ; RV64ZVE32F-NEXT:  .LBB41_4: # %else6
-; RV64ZVE32F-NEXT:    andi a0, a7, 16
+; RV64ZVE32F-NEXT:    andi a0, t0, 16
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_14
 ; RV64ZVE32F-NEXT:  .LBB41_5: # %else8
-; RV64ZVE32F-NEXT:    andi a0, a7, 32
+; RV64ZVE32F-NEXT:    andi a0, t0, 32
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_15
 ; RV64ZVE32F-NEXT:  .LBB41_6: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a7, 64
+; RV64ZVE32F-NEXT:    andi a0, t0, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_16
 ; RV64ZVE32F-NEXT:  .LBB41_7: # %else12
-; RV64ZVE32F-NEXT:    andi a0, a7, -128
+; RV64ZVE32F-NEXT:    andi a0, t0, -128
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_9
 ; RV64ZVE32F-NEXT:  .LBB41_8: # %cond.store13
-; RV64ZVE32F-NEXT:    sd a3, 0(a2)
+; RV64ZVE32F-NEXT:    sd a5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB41_9: # %else14
 ; RV64ZVE32F-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
 ; RV64ZVE32F-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
@@ -3431,31 +3431,31 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV64ZVE32F-NEXT:    ld a1, 0(a1)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a0, 0(a1)
-; RV64ZVE32F-NEXT:    andi a0, a7, 2
+; RV64ZVE32F-NEXT:    andi a0, t0, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_2
 ; RV64ZVE32F-NEXT:  .LBB41_11: # %cond.store1
-; RV64ZVE32F-NEXT:    sd s1, 0(s0)
-; RV64ZVE32F-NEXT:    andi a0, a7, 4
+; RV64ZVE32F-NEXT:    sd s1, 0(t5)
+; RV64ZVE32F-NEXT:    andi a0, t0, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_3
 ; RV64ZVE32F-NEXT:  .LBB41_12: # %cond.store3
-; RV64ZVE32F-NEXT:    sd t6, 0(t5)
-; RV64ZVE32F-NEXT:    andi a0, a7, 8
+; RV64ZVE32F-NEXT:    sd s0, 0(t3)
+; RV64ZVE32F-NEXT:    andi a0, t0, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_4
 ; RV64ZVE32F-NEXT:  .LBB41_13: # %cond.store5
-; RV64ZVE32F-NEXT:    sd t4, 0(t3)
-; RV64ZVE32F-NEXT:    andi a0, a7, 16
+; RV64ZVE32F-NEXT:    sd t6, 0(t1)
+; RV64ZVE32F-NEXT:    andi a0, t0, 16
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_5
 ; RV64ZVE32F-NEXT:  .LBB41_14: # %cond.store7
-; RV64ZVE32F-NEXT:    sd t2, 0(t1)
-; RV64ZVE32F-NEXT:    andi a0, a7, 32
+; RV64ZVE32F-NEXT:    sd t4, 0(a6)
+; RV64ZVE32F-NEXT:    andi a0, t0, 32
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_6
 ; RV64ZVE32F-NEXT:  .LBB41_15: # %cond.store9
-; RV64ZVE32F-NEXT:    sd t0, 0(a6)
-; RV64ZVE32F-NEXT:    andi a0, a7, 64
+; RV64ZVE32F-NEXT:    sd t2, 0(a4)
+; RV64ZVE32F-NEXT:    andi a0, t0, 64
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_7
 ; RV64ZVE32F-NEXT:  .LBB41_16: # %cond.store11
-; RV64ZVE32F-NEXT:    sd a5, 0(a4)
-; RV64ZVE32F-NEXT:    andi a0, a7, -128
+; RV64ZVE32F-NEXT:    sd a7, 0(a3)
+; RV64ZVE32F-NEXT:    andi a0, t0, -128
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_8
 ; RV64ZVE32F-NEXT:    j .LBB41_9
   call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %val, <8 x ptr> %ptrs, i32 8, <8 x i1> %m)
@@ -3490,20 +3490,20 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -3537,8 +3537,8 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB42_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -3566,53 +3566,53 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_4
 ; RV32ZVE32F-NEXT:  .LBB42_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_5
 ; RV32ZVE32F-NEXT:  .LBB42_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_6
 ; RV32ZVE32F-NEXT:  .LBB42_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_7
 ; RV32ZVE32F-NEXT:  .LBB42_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB42_8
 ; RV32ZVE32F-NEXT:    j .LBB42_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -3734,20 +3734,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -3781,8 +3781,8 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB43_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -3810,53 +3810,53 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_4
 ; RV32ZVE32F-NEXT:  .LBB43_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_5
 ; RV32ZVE32F-NEXT:  .LBB43_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_6
 ; RV32ZVE32F-NEXT:  .LBB43_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_7
 ; RV32ZVE32F-NEXT:  .LBB43_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB43_8
 ; RV32ZVE32F-NEXT:    j .LBB43_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -3980,20 +3980,20 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4027,8 +4027,8 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB44_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4056,53 +4056,53 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_4
 ; RV32ZVE32F-NEXT:  .LBB44_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_5
 ; RV32ZVE32F-NEXT:  .LBB44_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_6
 ; RV32ZVE32F-NEXT:  .LBB44_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_7
 ; RV32ZVE32F-NEXT:  .LBB44_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB44_8
 ; RV32ZVE32F-NEXT:    j .LBB44_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -4233,20 +4233,20 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4280,8 +4280,8 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB45_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4309,53 +4309,53 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_4
 ; RV32ZVE32F-NEXT:  .LBB45_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_5
 ; RV32ZVE32F-NEXT:  .LBB45_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_6
 ; RV32ZVE32F-NEXT:  .LBB45_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_7
 ; RV32ZVE32F-NEXT:  .LBB45_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB45_8
 ; RV32ZVE32F-NEXT:    j .LBB45_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -4478,20 +4478,20 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4525,8 +4525,8 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB46_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4554,53 +4554,53 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_4
 ; RV32ZVE32F-NEXT:  .LBB46_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_5
 ; RV32ZVE32F-NEXT:  .LBB46_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_6
 ; RV32ZVE32F-NEXT:  .LBB46_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_7
 ; RV32ZVE32F-NEXT:  .LBB46_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB46_8
 ; RV32ZVE32F-NEXT:    j .LBB46_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -4725,20 +4725,20 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4772,8 +4772,8 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB47_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4801,53 +4801,53 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_4
 ; RV32ZVE32F-NEXT:  .LBB47_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_5
 ; RV32ZVE32F-NEXT:  .LBB47_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_6
 ; RV32ZVE32F-NEXT:  .LBB47_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_7
 ; RV32ZVE32F-NEXT:  .LBB47_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB47_8
 ; RV32ZVE32F-NEXT:    j .LBB47_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a6, 40(a0)
-; RV64ZVE32F-NEXT:    ld a7, 32(a0)
-; RV64ZVE32F-NEXT:    ld t0, 24(a0)
-; RV64ZVE32F-NEXT:    ld t1, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t2, 8(a0)
+; RV64ZVE32F-NEXT:    ld t1, 16(a0)
+; RV64ZVE32F-NEXT:    ld t0, 24(a0)
+; RV64ZVE32F-NEXT:    ld a7, 32(a0)
+; RV64ZVE32F-NEXT:    ld a6, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    lui a4, 16
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a5, v0
@@ -4980,20 +4980,20 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5026,8 +5026,8 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB48_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -5055,53 +5055,53 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_4
 ; RV32ZVE32F-NEXT:  .LBB48_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_5
 ; RV32ZVE32F-NEXT:  .LBB48_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_6
 ; RV32ZVE32F-NEXT:  .LBB48_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_7
 ; RV32ZVE32F-NEXT:  .LBB48_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_8
 ; RV32ZVE32F-NEXT:    j .LBB48_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -5223,20 +5223,20 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5269,8 +5269,8 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB49_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -5298,53 +5298,53 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_4
 ; RV32ZVE32F-NEXT:  .LBB49_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_5
 ; RV32ZVE32F-NEXT:  .LBB49_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_6
 ; RV32ZVE32F-NEXT:  .LBB49_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_7
 ; RV32ZVE32F-NEXT:  .LBB49_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_8
 ; RV32ZVE32F-NEXT:    j .LBB49_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -5467,20 +5467,20 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5513,8 +5513,8 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB50_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -5542,53 +5542,53 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_4
 ; RV32ZVE32F-NEXT:  .LBB50_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_5
 ; RV32ZVE32F-NEXT:  .LBB50_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_6
 ; RV32ZVE32F-NEXT:  .LBB50_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_7
 ; RV32ZVE32F-NEXT:  .LBB50_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_8
 ; RV32ZVE32F-NEXT:    j .LBB50_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -5731,36 +5731,36 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    .cfi_offset s6, -28
 ; RV32ZVE32F-NEXT:    .cfi_offset s7, -32
 ; RV32ZVE32F-NEXT:    .cfi_offset s8, -36
-; RV32ZVE32F-NEXT:    lw a3, 60(a0)
-; RV32ZVE32F-NEXT:    lw a4, 56(a0)
-; RV32ZVE32F-NEXT:    lw a5, 52(a0)
-; RV32ZVE32F-NEXT:    lw a6, 48(a0)
-; RV32ZVE32F-NEXT:    lw a7, 44(a0)
-; RV32ZVE32F-NEXT:    lw t0, 40(a0)
-; RV32ZVE32F-NEXT:    lw t1, 36(a0)
-; RV32ZVE32F-NEXT:    lw t2, 32(a0)
-; RV32ZVE32F-NEXT:    lw t3, 28(a0)
-; RV32ZVE32F-NEXT:    lw t4, 24(a0)
-; RV32ZVE32F-NEXT:    lw t5, 20(a0)
-; RV32ZVE32F-NEXT:    lw t6, 16(a0)
-; RV32ZVE32F-NEXT:    lw s1, 12(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 8(a0)
-; RV32ZVE32F-NEXT:    lw s2, 56(a2)
-; RV32ZVE32F-NEXT:    lw s3, 48(a2)
-; RV32ZVE32F-NEXT:    lw s4, 40(a2)
+; RV32ZVE32F-NEXT:    lw s1, 12(a0)
+; RV32ZVE32F-NEXT:    lw t5, 16(a0)
+; RV32ZVE32F-NEXT:    lw t6, 20(a0)
+; RV32ZVE32F-NEXT:    lw t3, 24(a0)
+; RV32ZVE32F-NEXT:    lw t4, 28(a0)
+; RV32ZVE32F-NEXT:    lw t1, 32(a0)
+; RV32ZVE32F-NEXT:    lw t2, 36(a0)
+; RV32ZVE32F-NEXT:    lw a7, 40(a0)
+; RV32ZVE32F-NEXT:    lw t0, 44(a0)
+; RV32ZVE32F-NEXT:    lw a5, 48(a0)
+; RV32ZVE32F-NEXT:    lw a6, 52(a0)
+; RV32ZVE32F-NEXT:    lw a3, 56(a0)
+; RV32ZVE32F-NEXT:    lw a4, 60(a0)
+; RV32ZVE32F-NEXT:    lw s2, 8(a2)
+; RV32ZVE32F-NEXT:    lw s3, 16(a2)
+; RV32ZVE32F-NEXT:    lw s4, 24(a2)
 ; RV32ZVE32F-NEXT:    lw s5, 32(a2)
-; RV32ZVE32F-NEXT:    lw s6, 24(a2)
-; RV32ZVE32F-NEXT:    lw s7, 16(a2)
-; RV32ZVE32F-NEXT:    lw s8, 8(a2)
+; RV32ZVE32F-NEXT:    lw s6, 40(a2)
+; RV32ZVE32F-NEXT:    lw s7, 48(a2)
+; RV32ZVE32F-NEXT:    lw s8, 56(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a2), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
@@ -5792,8 +5792,8 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB51_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
@@ -5827,40 +5827,40 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t6, 0(a0)
-; RV32ZVE32F-NEXT:    sw t5, 4(a0)
+; RV32ZVE32F-NEXT:    sw t5, 0(a0)
+; RV32ZVE32F-NEXT:    sw t6, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_4
 ; RV32ZVE32F-NEXT:  .LBB51_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t4, 0(a0)
-; RV32ZVE32F-NEXT:    sw t3, 4(a0)
+; RV32ZVE32F-NEXT:    sw t3, 0(a0)
+; RV32ZVE32F-NEXT:    sw t4, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_5
 ; RV32ZVE32F-NEXT:  .LBB51_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t2, 0(a0)
-; RV32ZVE32F-NEXT:    sw t1, 4(a0)
+; RV32ZVE32F-NEXT:    sw t1, 0(a0)
+; RV32ZVE32F-NEXT:    sw t2, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_6
 ; RV32ZVE32F-NEXT:  .LBB51_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t0, 0(a0)
-; RV32ZVE32F-NEXT:    sw a7, 4(a0)
+; RV32ZVE32F-NEXT:    sw a7, 0(a0)
+; RV32ZVE32F-NEXT:    sw t0, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_7
 ; RV32ZVE32F-NEXT:  .LBB51_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a6, 0(a0)
-; RV32ZVE32F-NEXT:    sw a5, 4(a0)
+; RV32ZVE32F-NEXT:    sw a5, 0(a0)
+; RV32ZVE32F-NEXT:    sw a6, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB51_8
 ; RV32ZVE32F-NEXT:    j .LBB51_9
@@ -5877,16 +5877,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV64ZVE32F-NEXT:    .cfi_offset s1, -16
 ; RV64ZVE32F-NEXT:    .cfi_offset s2, -24
 ; RV64ZVE32F-NEXT:    .cfi_offset s3, -32
-; RV64ZVE32F-NEXT:    ld a3, 56(a0)
-; RV64ZVE32F-NEXT:    ld a4, 48(a0)
-; RV64ZVE32F-NEXT:    ld a6, 40(a0)
-; RV64ZVE32F-NEXT:    ld t1, 32(a0)
+; RV64ZVE32F-NEXT:    ld s0, 8(a0)
+; RV64ZVE32F-NEXT:    ld t5, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t3, 24(a0)
-; RV64ZVE32F-NEXT:    ld t6, 16(a0)
-; RV64ZVE32F-NEXT:    ld s1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t1, 32(a0)
+; RV64ZVE32F-NEXT:    ld a6, 40(a0)
+; RV64ZVE32F-NEXT:    ld a4, 48(a0)
+; RV64ZVE32F-NEXT:    ld a3, 56(a0)
 ; RV64ZVE32F-NEXT:    ld s2, 8(a2)
-; RV64ZVE32F-NEXT:    ld s0, 16(a2)
-; RV64ZVE32F-NEXT:    ld t5, 24(a2)
+; RV64ZVE32F-NEXT:    ld s1, 16(a2)
+; RV64ZVE32F-NEXT:    ld t6, 24(a2)
 ; RV64ZVE32F-NEXT:    ld t4, 32(a2)
 ; RV64ZVE32F-NEXT:    ld t2, 40(a2)
 ; RV64ZVE32F-NEXT:    ld t0, 48(a2)
@@ -5938,19 +5938,19 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB51_11: # %cond.store1
 ; RV64ZVE32F-NEXT:    slli s2, s2, 3
 ; RV64ZVE32F-NEXT:    add s2, a1, s2
-; RV64ZVE32F-NEXT:    sd s1, 0(s2)
+; RV64ZVE32F-NEXT:    sd s0, 0(s2)
 ; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB51_3
 ; RV64ZVE32F-NEXT:  .LBB51_12: # %cond.store3
-; RV64ZVE32F-NEXT:    slli s0, s0, 3
-; RV64ZVE32F-NEXT:    add s0, a1, s0
-; RV64ZVE32F-NEXT:    sd t6, 0(s0)
+; RV64ZVE32F-NEXT:    slli s1, s1, 3
+; RV64ZVE32F-NEXT:    add s1, a1, s1
+; RV64ZVE32F-NEXT:    sd t5, 0(s1)
 ; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB51_4
 ; RV64ZVE32F-NEXT:  .LBB51_13: # %cond.store5
-; RV64ZVE32F-NEXT:    slli t5, t5, 3
-; RV64ZVE32F-NEXT:    add t5, a1, t5
-; RV64ZVE32F-NEXT:    sd t3, 0(t5)
+; RV64ZVE32F-NEXT:    slli t6, t6, 3
+; RV64ZVE32F-NEXT:    add t6, a1, t6
+; RV64ZVE32F-NEXT:    sd t3, 0(t6)
 ; RV64ZVE32F-NEXT:    andi a0, a7, 16
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB51_5
 ; RV64ZVE32F-NEXT:  .LBB51_14: # %cond.store7
@@ -6075,9 +6075,9 @@ define void @mscatter_v4f16(<4 x half> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4f16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -6135,9 +6135,9 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4f16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -6207,13 +6207,13 @@ define void @mscatter_v8f16(<8 x half> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8f16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -6927,9 +6927,9 @@ define void @mscatter_v4f32(<4 x float> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4f32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -6987,9 +6987,9 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4f32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -7059,13 +7059,13 @@ define void @mscatter_v8f32(<8 x float> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8f32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -8283,9 +8283,9 @@ define void @mscatter_v4f64(<4 x double> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4f64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -8380,9 +8380,9 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4f64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v8
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -8529,13 +8529,13 @@ define void @mscatter_v8f64(<8 x double> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8f64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -10452,22 +10452,22 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx
 ;
 ; RV32ZVE32F-LABEL: mscatter_baseidx_v8f64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a2, 56(a1)
-; RV32ZVE32F-NEXT:    lw a3, 48(a1)
-; RV32ZVE32F-NEXT:    lw a4, 40(a1)
+; RV32ZVE32F-NEXT:    lw a2, 8(a1)
+; RV32ZVE32F-NEXT:    lw a3, 16(a1)
+; RV32ZVE32F-NEXT:    lw a4, 24(a1)
 ; RV32ZVE32F-NEXT:    lw a5, 32(a1)
-; RV32ZVE32F-NEXT:    lw a6, 24(a1)
-; RV32ZVE32F-NEXT:    lw a7, 16(a1)
-; RV32ZVE32F-NEXT:    lw t0, 8(a1)
+; RV32ZVE32F-NEXT:    lw a6, 40(a1)
+; RV32ZVE32F-NEXT:    lw a7, 48(a1)
+; RV32ZVE32F-NEXT:    lw t0, 56(a1)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a1), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index eeb188627577dd..66331eb4e4010a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -743,18 +743,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB12_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT:    ld a6, 8(a1)
-; ZVE32F-NEXT:    ld a7, 0(a1)
-; ZVE32F-NEXT:    ld t0, 24(a1)
-; ZVE32F-NEXT:    ld t1, 16(a1)
+; ZVE32F-NEXT:    ld a6, 0(a1)
+; ZVE32F-NEXT:    ld a7, 8(a1)
+; ZVE32F-NEXT:    ld t0, 16(a1)
+; ZVE32F-NEXT:    ld t1, 24(a1)
 ; ZVE32F-NEXT:    mul t2, a4, a5
 ; ZVE32F-NEXT:    add t2, a0, t2
 ; ZVE32F-NEXT:    mul t3, a2, a5
 ; ZVE32F-NEXT:    add t3, a0, t3
-; ZVE32F-NEXT:    sd a7, 0(t3)
-; ZVE32F-NEXT:    sd a6, 0(t2)
-; ZVE32F-NEXT:    sd t1, 80(t3)
-; ZVE32F-NEXT:    sd t0, 80(t2)
+; ZVE32F-NEXT:    sd a6, 0(t3)
+; ZVE32F-NEXT:    sd a7, 0(t2)
+; ZVE32F-NEXT:    sd t0, 80(t3)
+; ZVE32F-NEXT:    sd t1, 80(t2)
 ; ZVE32F-NEXT:    addi a2, a2, 4
 ; ZVE32F-NEXT:    addi a1, a1, 32
 ; ZVE32F-NEXT:    addi a4, a4, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 7497051027fa37..afcfc4889c68bb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -364,21 +364,21 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu a1, 16(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -459,18 +459,18 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -499,7 +499,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -557,17 +557,17 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 24(a1)
-; CHECK-NOV-NEXT:    lhu s3, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -631,18 +631,18 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -671,7 +671,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -726,17 +726,17 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 24(a1)
-; CHECK-NOV-NEXT:    lhu s3, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -812,18 +812,18 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -852,7 +852,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -1267,37 +1267,37 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
-; CHECK-NOV-NEXT:    lhu a1, 48(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu a2, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -1448,22 +1448,22 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s6, 24(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -1481,7 +1481,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -1593,33 +1593,33 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 56(a1)
-; CHECK-NOV-NEXT:    lhu s3, 48(a1)
-; CHECK-NOV-NEXT:    lhu s4, 40(a1)
-; CHECK-NOV-NEXT:    lhu s5, 32(a1)
-; CHECK-NOV-NEXT:    lhu s6, 24(a1)
-; CHECK-NOV-NEXT:    lhu s7, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    lhu s4, 32(a1)
+; CHECK-NOV-NEXT:    lhu s5, 40(a1)
+; CHECK-NOV-NEXT:    lhu s6, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -1731,22 +1731,22 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s6, 24(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -1764,7 +1764,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -1872,33 +1872,33 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 56(a1)
-; CHECK-NOV-NEXT:    lhu s3, 48(a1)
-; CHECK-NOV-NEXT:    lhu s4, 40(a1)
-; CHECK-NOV-NEXT:    lhu s5, 32(a1)
-; CHECK-NOV-NEXT:    lhu s6, 24(a1)
-; CHECK-NOV-NEXT:    lhu s7, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    lhu s4, 32(a1)
+; CHECK-NOV-NEXT:    lhu s5, 40(a1)
+; CHECK-NOV-NEXT:    lhu s6, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -2034,22 +2034,22 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s6, 24(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -2067,7 +2067,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -3700,21 +3700,21 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu a1, 16(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -3795,18 +3795,18 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -3835,7 +3835,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -3891,17 +3891,17 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 24(a1)
-; CHECK-NOV-NEXT:    lhu s3, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -3965,18 +3965,18 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -4005,7 +4005,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4058,21 +4058,21 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu a1, 16(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -4145,18 +4145,18 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -4185,7 +4185,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4588,37 +4588,37 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
-; CHECK-NOV-NEXT:    lhu a1, 48(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu a2, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -4769,22 +4769,22 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s6, 24(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -4802,7 +4802,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -4912,33 +4912,33 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 56(a1)
-; CHECK-NOV-NEXT:    lhu s3, 48(a1)
-; CHECK-NOV-NEXT:    lhu s4, 40(a1)
-; CHECK-NOV-NEXT:    lhu s5, 32(a1)
-; CHECK-NOV-NEXT:    lhu s6, 24(a1)
-; CHECK-NOV-NEXT:    lhu s7, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    lhu s4, 32(a1)
+; CHECK-NOV-NEXT:    lhu s5, 40(a1)
+; CHECK-NOV-NEXT:    lhu s6, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fcvt.lu.s s2, fs6, rtz
@@ -5048,22 +5048,22 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s6, 24(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -5081,7 +5081,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -5187,37 +5187,37 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
-; CHECK-NOV-NEXT:    lhu a1, 48(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu a2, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -5350,22 +5350,22 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 0(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s6, 24(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -5383,7 +5383,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
index 65dca0daed8c77..e0c58bc3230851 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
@@ -9,20 +9,20 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    lhu s0, 6(a0)
+; CHECK-NEXT:    lhu s0, 0(a0)
+; CHECK-NEXT:    lhu a1, 2(a0)
 ; CHECK-NEXT:    lhu s1, 4(a0)
-; CHECK-NEXT:    lhu s2, 0(a0)
-; CHECK-NEXT:    lhu a0, 2(a0)
-; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    lhu s2, 6(a0)
+; CHECK-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 8(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s2
+; CHECK-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 0(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 12(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s0
+; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 4(sp)
 ; CHECK-NEXT:    addi a0, sp, 8
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 97121c275a2944..2e41622291e0df 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -151,12 +151,19 @@ define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: lshr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sb zero, 35(sp)
+; RV32I-NEXT:    sb zero, 34(sp)
+; RV32I-NEXT:    sb zero, 33(sp)
+; RV32I-NEXT:    sb zero, 32(sp)
 ; RV32I-NEXT:    sb zero, 31(sp)
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
@@ -169,94 +176,90 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sb zero, 22(sp)
 ; RV32I-NEXT:    sb zero, 21(sp)
 ; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb a1, 12(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    sb a4, 4(sp)
-; RV32I-NEXT:    sb a3, 0(sp)
+; RV32I-NEXT:    sb a1, 16(sp)
+; RV32I-NEXT:    sb a5, 12(sp)
+; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a3, 4(sp)
 ; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 15(sp)
+; RV32I-NEXT:    sb a6, 19(sp)
 ; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 14(sp)
+; RV32I-NEXT:    sb a6, 18(sp)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(sp)
+; RV32I-NEXT:    sb a1, 17(sp)
 ; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 11(sp)
+; RV32I-NEXT:    sb a1, 15(sp)
 ; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 10(sp)
+; RV32I-NEXT:    sb a1, 14(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(sp)
+; RV32I-NEXT:    sb a5, 13(sp)
 ; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(sp)
+; RV32I-NEXT:    sb a1, 11(sp)
 ; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(sp)
+; RV32I-NEXT:    sb a1, 10(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(sp)
+; RV32I-NEXT:    sb a4, 9(sp)
 ; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 3(sp)
+; RV32I-NEXT:    sb a1, 7(sp)
 ; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 2(sp)
+; RV32I-NEXT:    sb a1, 6(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 1(sp)
+; RV32I-NEXT:    sb a3, 5(sp)
 ; RV32I-NEXT:    slli a1, a2, 25
 ; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    addi a3, sp, 4
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a1)
+; RV32I-NEXT:    lbu t0, 5(a1)
+; RV32I-NEXT:    lbu t1, 6(a1)
+; RV32I-NEXT:    lbu t2, 7(a1)
+; RV32I-NEXT:    lbu t3, 8(a1)
+; RV32I-NEXT:    lbu t4, 9(a1)
+; RV32I-NEXT:    lbu t5, 10(a1)
+; RV32I-NEXT:    lbu t6, 11(a1)
+; RV32I-NEXT:    lbu s0, 12(a1)
+; RV32I-NEXT:    lbu s1, 13(a1)
+; RV32I-NEXT:    lbu s2, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, t0, a7
 ; RV32I-NEXT:    andi a2, a2, 7
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a5, t2, t1
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
 ; RV32I-NEXT:    xori a6, a2, 31
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a7, t6, t5
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
 ; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    srl a4, a4, a2
 ; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -265,7 +268,10 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: lshr128:
@@ -293,125 +299,131 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: ashr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw a3, 12(a1)
-; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    lw a5, 4(a1)
-; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    lw a5, 0(a1)
+; RV32I-NEXT:    lw a6, 4(a1)
+; RV32I-NEXT:    lw a1, 0(a2)
+; RV32I-NEXT:    sb a4, 16(sp)
 ; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a6, 8(sp)
 ; RV32I-NEXT:    sb a5, 4(sp)
-; RV32I-NEXT:    sb a1, 0(sp)
-; RV32I-NEXT:    srai a6, a3, 31
-; RV32I-NEXT:    sb a6, 28(sp)
-; RV32I-NEXT:    sb a6, 24(sp)
-; RV32I-NEXT:    sb a6, 20(sp)
-; RV32I-NEXT:    sb a6, 16(sp)
-; RV32I-NEXT:    srli a7, a3, 24
-; RV32I-NEXT:    sb a7, 15(sp)
-; RV32I-NEXT:    srli a7, a3, 16
-; RV32I-NEXT:    sb a7, 14(sp)
+; RV32I-NEXT:    srai a2, a4, 31
+; RV32I-NEXT:    sb a2, 32(sp)
+; RV32I-NEXT:    sb a2, 28(sp)
+; RV32I-NEXT:    sb a2, 24(sp)
+; RV32I-NEXT:    sb a2, 20(sp)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 19(sp)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 18(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(sp)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 13(sp)
-; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    srli a3, a6, 24
 ; RV32I-NEXT:    sb a3, 11(sp)
-; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    srli a3, a6, 16
 ; RV32I-NEXT:    sb a3, 10(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(sp)
+; RV32I-NEXT:    srli a3, a6, 8
+; RV32I-NEXT:    sb a3, 9(sp)
 ; RV32I-NEXT:    srli a3, a5, 24
 ; RV32I-NEXT:    sb a3, 7(sp)
 ; RV32I-NEXT:    srli a3, a5, 16
 ; RV32I-NEXT:    sb a3, 6(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
 ; RV32I-NEXT:    sb a5, 5(sp)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 3(sp)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 2(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(sp)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 31(sp)
-; RV32I-NEXT:    srli a3, a6, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a3, a2, 24
+; RV32I-NEXT:    sb a3, 35(sp)
+; RV32I-NEXT:    srli a4, a2, 16
+; RV32I-NEXT:    sb a4, 34(sp)
+; RV32I-NEXT:    srli a2, a2, 8
+; RV32I-NEXT:    sb a2, 33(sp)
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    sb a2, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a2, 25(sp)
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a4, 22(sp)
+; RV32I-NEXT:    sb a2, 21(sp)
+; RV32I-NEXT:    slli a2, a1, 25
+; RV32I-NEXT:    srli a2, a2, 28
+; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    add a2, a3, a2
+; RV32I-NEXT:    lbu a3, 0(a2)
+; RV32I-NEXT:    lbu a4, 1(a2)
+; RV32I-NEXT:    lbu a5, 2(a2)
+; RV32I-NEXT:    lbu a6, 3(a2)
+; RV32I-NEXT:    lbu a7, 4(a2)
+; RV32I-NEXT:    lbu t0, 5(a2)
+; RV32I-NEXT:    lbu t1, 6(a2)
+; RV32I-NEXT:    lbu t2, 7(a2)
+; RV32I-NEXT:    lbu t3, 8(a2)
+; RV32I-NEXT:    lbu t4, 9(a2)
+; RV32I-NEXT:    lbu t5, 10(a2)
+; RV32I-NEXT:    lbu t6, 11(a2)
+; RV32I-NEXT:    lbu s0, 12(a2)
+; RV32I-NEXT:    lbu s1, 13(a2)
+; RV32I-NEXT:    lbu s2, 14(a2)
+; RV32I-NEXT:    lbu a2, 15(a2)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    andi a1, a1, 7
+; RV32I-NEXT:    srl a3, a3, a1
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a5, t2, t1
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    xori a6, a1, 31
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a7, t6, t5
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
+; RV32I-NEXT:    not t0, a1
 ; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    srl a4, a4, a1
 ; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a2, a2, 24
+; RV32I-NEXT:    or a2, a2, s2
+; RV32I-NEXT:    or a2, a2, s0
+; RV32I-NEXT:    slli a7, a2, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    sra a1, a2, a1
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ashr128:
@@ -439,12 +451,19 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: shl128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sb zero, 19(sp)
+; RV32I-NEXT:    sb zero, 18(sp)
+; RV32I-NEXT:    sb zero, 17(sp)
+; RV32I-NEXT:    sb zero, 16(sp)
 ; RV32I-NEXT:    sb zero, 15(sp)
 ; RV32I-NEXT:    sb zero, 14(sp)
 ; RV32I-NEXT:    sb zero, 13(sp)
@@ -457,103 +476,102 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sb zero, 6(sp)
 ; RV32I-NEXT:    sb zero, 5(sp)
 ; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb zero, 3(sp)
-; RV32I-NEXT:    sb zero, 2(sp)
-; RV32I-NEXT:    sb zero, 1(sp)
-; RV32I-NEXT:    sb zero, 0(sp)
-; RV32I-NEXT:    sb a1, 28(sp)
-; RV32I-NEXT:    sb a5, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a3, 16(sp)
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    sb a5, 28(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    sb a3, 20(sp)
 ; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 31(sp)
+; RV32I-NEXT:    sb a6, 35(sp)
 ; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 30(sp)
+; RV32I-NEXT:    sb a6, 34(sp)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    sb a1, 33(sp)
 ; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 27(sp)
+; RV32I-NEXT:    sb a1, 31(sp)
 ; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 26(sp)
+; RV32I-NEXT:    sb a1, 30(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 25(sp)
+; RV32I-NEXT:    sb a5, 29(sp)
 ; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 23(sp)
+; RV32I-NEXT:    sb a1, 27(sp)
 ; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 22(sp)
+; RV32I-NEXT:    sb a1, 26(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 21(sp)
+; RV32I-NEXT:    sb a4, 25(sp)
 ; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 19(sp)
+; RV32I-NEXT:    sb a1, 23(sp)
 ; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 18(sp)
+; RV32I-NEXT:    sb a1, 22(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 17(sp)
+; RV32I-NEXT:    sb a3, 21(sp)
 ; RV32I-NEXT:    slli a1, a2, 25
 ; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a1, a3, a1
-; RV32I-NEXT:    lbu a3, 5(a1)
-; RV32I-NEXT:    lbu a4, 4(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    sll a4, a3, a2
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu t0, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    srli a6, a5, 1
-; RV32I-NEXT:    xori a7, a2, 31
-; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 9(a1)
-; RV32I-NEXT:    lbu t0, 8(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    addi a3, sp, 20
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lbu a1, 0(a3)
+; RV32I-NEXT:    lbu a4, 1(a3)
+; RV32I-NEXT:    lbu a5, 2(a3)
+; RV32I-NEXT:    lbu a6, 3(a3)
+; RV32I-NEXT:    lbu a7, 4(a3)
+; RV32I-NEXT:    lbu t0, 5(a3)
+; RV32I-NEXT:    lbu t1, 6(a3)
+; RV32I-NEXT:    lbu t2, 7(a3)
+; RV32I-NEXT:    lbu t3, 8(a3)
+; RV32I-NEXT:    lbu t4, 9(a3)
+; RV32I-NEXT:    lbu t5, 10(a3)
+; RV32I-NEXT:    lbu t6, 11(a3)
+; RV32I-NEXT:    lbu s0, 12(a3)
+; RV32I-NEXT:    lbu s1, 13(a3)
+; RV32I-NEXT:    lbu s2, 14(a3)
+; RV32I-NEXT:    lbu a3, 15(a3)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    sll t0, a6, a2
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    not t1, a2
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    lbu t0, 13(a1)
-; RV32I-NEXT:    lbu t1, 12(a1)
-; RV32I-NEXT:    lbu t2, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    sll a4, a7, a2
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    srli a5, a1, 1
+; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    srli a7, a7, 1
+; RV32I-NEXT:    not t0, a2
+; RV32I-NEXT:    srl a7, a7, t0
+; RV32I-NEXT:    sll t0, a5, a2
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, s2
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    sll a3, a3, a2
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    srli a6, a6, 1
-; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    sll a2, a5, a2
-; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a7, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: shl128:
@@ -616,10 +634,10 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-LABEL: fshr128_minsize:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a3, 8(a1)
-; RV32I-NEXT:    lw t2, 0(a1)
 ; RV32I-NEXT:    lw a2, 0(a2)
+; RV32I-NEXT:    lw t2, 0(a1)
 ; RV32I-NEXT:    lw a7, 4(a1)
+; RV32I-NEXT:    lw a3, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    andi t1, a2, 64
 ; RV32I-NEXT:    mv t0, a7
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 122388c1b73ec3..856a68d5f277a9 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -310,22 +310,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lbu a0, 12(a0)
-; RV32-NEXT:    lw a1, 8(s0)
-; RV32-NEXT:    slli a2, a0, 30
-; RV32-NEXT:    lw a3, 4(s0)
-; RV32-NEXT:    srli s1, a1, 2
-; RV32-NEXT:    or s1, s1, a2
-; RV32-NEXT:    slli a2, a1, 31
-; RV32-NEXT:    srli a4, a3, 1
-; RV32-NEXT:    or s2, a4, a2
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    slli a0, a0, 31
-; RV32-NEXT:    srai s3, a0, 31
-; RV32-NEXT:    srli a1, a1, 1
+; RV32-NEXT:    lbu a1, 12(a0)
+; RV32-NEXT:    lw a2, 8(a0)
+; RV32-NEXT:    lw a3, 4(a0)
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    slli a4, a1, 30
+; RV32-NEXT:    srli s1, a2, 2
+; RV32-NEXT:    or s1, s1, a4
+; RV32-NEXT:    slli a4, a2, 31
+; RV32-NEXT:    srli a5, a3, 1
+; RV32-NEXT:    or s2, a5, a4
+; RV32-NEXT:    srli a1, a1, 2
 ; RV32-NEXT:    slli a1, a1, 31
-; RV32-NEXT:    lw a0, 0(s0)
-; RV32-NEXT:    srai s4, a1, 31
+; RV32-NEXT:    srai s3, a1, 31
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    slli a2, a2, 31
+; RV32-NEXT:    srai s4, a2, 31
 ; RV32-NEXT:    slli a1, a3, 31
 ; RV32-NEXT:    srai a1, a1, 31
 ; RV32-NEXT:    li a2, 6
@@ -391,8 +391,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:    lbu a0, 12(a0)
 ; RV64-NEXT:    lwu a1, 8(s0)
-; RV64-NEXT:    slli a0, a0, 32
 ; RV64-NEXT:    ld a2, 0(s0)
+; RV64-NEXT:    slli a0, a0, 32
 ; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    slli a0, a0, 29
 ; RV64-NEXT:    srai s1, a0, 31
@@ -462,22 +462,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    mv s0, a0
-; RV32M-NEXT:    lbu a0, 12(a0)
-; RV32M-NEXT:    lw a1, 8(s0)
-; RV32M-NEXT:    slli a2, a0, 30
-; RV32M-NEXT:    lw a3, 4(s0)
-; RV32M-NEXT:    srli s1, a1, 2
-; RV32M-NEXT:    or s1, s1, a2
-; RV32M-NEXT:    slli a2, a1, 31
-; RV32M-NEXT:    srli a4, a3, 1
-; RV32M-NEXT:    or s2, a4, a2
-; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    slli a0, a0, 31
-; RV32M-NEXT:    srai s3, a0, 31
-; RV32M-NEXT:    srli a1, a1, 1
+; RV32M-NEXT:    lbu a1, 12(a0)
+; RV32M-NEXT:    lw a2, 8(a0)
+; RV32M-NEXT:    lw a3, 4(a0)
+; RV32M-NEXT:    lw a0, 0(a0)
+; RV32M-NEXT:    slli a4, a1, 30
+; RV32M-NEXT:    srli s1, a2, 2
+; RV32M-NEXT:    or s1, s1, a4
+; RV32M-NEXT:    slli a4, a2, 31
+; RV32M-NEXT:    srli a5, a3, 1
+; RV32M-NEXT:    or s2, a5, a4
+; RV32M-NEXT:    srli a1, a1, 2
 ; RV32M-NEXT:    slli a1, a1, 31
-; RV32M-NEXT:    lw a0, 0(s0)
-; RV32M-NEXT:    srai s4, a1, 31
+; RV32M-NEXT:    srai s3, a1, 31
+; RV32M-NEXT:    srli a2, a2, 1
+; RV32M-NEXT:    slli a2, a2, 31
+; RV32M-NEXT:    srai s4, a2, 31
 ; RV32M-NEXT:    slli a1, a3, 31
 ; RV32M-NEXT:    srai a1, a1, 31
 ; RV32M-NEXT:    li a2, 6
@@ -536,34 +536,34 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    ld a1, 0(a0)
 ; RV64M-NEXT:    lwu a2, 8(a0)
-; RV64M-NEXT:    srli a3, a1, 2
-; RV64M-NEXT:    lbu a4, 12(a0)
+; RV64M-NEXT:    lbu a3, 12(a0)
+; RV64M-NEXT:    srli a4, a1, 2
 ; RV64M-NEXT:    slli a5, a2, 62
-; RV64M-NEXT:    or a3, a5, a3
-; RV64M-NEXT:    srai a3, a3, 31
-; RV64M-NEXT:    slli a4, a4, 32
-; RV64M-NEXT:    or a2, a2, a4
+; RV64M-NEXT:    or a4, a5, a4
+; RV64M-NEXT:    srai a4, a4, 31
+; RV64M-NEXT:    slli a3, a3, 32
+; RV64M-NEXT:    or a2, a2, a3
 ; RV64M-NEXT:    slli a2, a2, 29
-; RV64M-NEXT:    lui a4, %hi(.LCPI3_0)
-; RV64M-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64M-NEXT:    lui a3, %hi(.LCPI3_0)
+; RV64M-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
 ; RV64M-NEXT:    srai a2, a2, 31
 ; RV64M-NEXT:    slli a1, a1, 31
 ; RV64M-NEXT:    srai a1, a1, 31
-; RV64M-NEXT:    mulh a4, a2, a4
-; RV64M-NEXT:    srli a5, a4, 63
-; RV64M-NEXT:    srai a4, a4, 1
-; RV64M-NEXT:    add a4, a4, a5
+; RV64M-NEXT:    mulh a3, a2, a3
+; RV64M-NEXT:    srli a5, a3, 63
+; RV64M-NEXT:    srai a3, a3, 1
+; RV64M-NEXT:    add a3, a3, a5
 ; RV64M-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64M-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    slli a4, a4, 2
-; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    mulh a4, a3, a5
-; RV64M-NEXT:    srli a5, a4, 63
-; RV64M-NEXT:    srai a4, a4, 1
-; RV64M-NEXT:    add a4, a4, a5
-; RV64M-NEXT:    slli a5, a4, 3
-; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    add a2, a2, a3
+; RV64M-NEXT:    slli a3, a3, 2
+; RV64M-NEXT:    add a2, a2, a3
+; RV64M-NEXT:    mulh a3, a4, a5
+; RV64M-NEXT:    srli a5, a3, 63
+; RV64M-NEXT:    srai a3, a3, 1
+; RV64M-NEXT:    add a3, a3, a5
+; RV64M-NEXT:    slli a5, a3, 3
+; RV64M-NEXT:    add a3, a4, a3
 ; RV64M-NEXT:    sub a3, a3, a5
 ; RV64M-NEXT:    addi a3, a3, -1
 ; RV64M-NEXT:    seqz a3, a3
@@ -612,22 +612,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    slli a1, a1, 1
 ; RV32MV-NEXT:    sub sp, sp, a1
 ; RV32MV-NEXT:    mv s0, a0
-; RV32MV-NEXT:    lbu a0, 12(a0)
-; RV32MV-NEXT:    lw a1, 8(s0)
-; RV32MV-NEXT:    slli a2, a0, 30
-; RV32MV-NEXT:    lw a3, 4(s0)
-; RV32MV-NEXT:    srli s1, a1, 2
-; RV32MV-NEXT:    or s1, s1, a2
-; RV32MV-NEXT:    slli a2, a1, 31
-; RV32MV-NEXT:    srli a4, a3, 1
-; RV32MV-NEXT:    or s2, a4, a2
-; RV32MV-NEXT:    srli a0, a0, 2
-; RV32MV-NEXT:    slli a0, a0, 31
-; RV32MV-NEXT:    srai s3, a0, 31
-; RV32MV-NEXT:    srli a1, a1, 1
+; RV32MV-NEXT:    lbu a1, 12(a0)
+; RV32MV-NEXT:    lw a2, 8(a0)
+; RV32MV-NEXT:    lw a3, 4(a0)
+; RV32MV-NEXT:    lw a0, 0(a0)
+; RV32MV-NEXT:    slli a4, a1, 30
+; RV32MV-NEXT:    srli s1, a2, 2
+; RV32MV-NEXT:    or s1, s1, a4
+; RV32MV-NEXT:    slli a4, a2, 31
+; RV32MV-NEXT:    srli a5, a3, 1
+; RV32MV-NEXT:    or s2, a5, a4
+; RV32MV-NEXT:    srli a1, a1, 2
 ; RV32MV-NEXT:    slli a1, a1, 31
-; RV32MV-NEXT:    lw a0, 0(s0)
-; RV32MV-NEXT:    srai s4, a1, 31
+; RV32MV-NEXT:    srai s3, a1, 31
+; RV32MV-NEXT:    srli a2, a2, 1
+; RV32MV-NEXT:    slli a2, a2, 31
+; RV32MV-NEXT:    srai s4, a2, 31
 ; RV32MV-NEXT:    slli a1, a3, 31
 ; RV32MV-NEXT:    srai a1, a1, 31
 ; RV32MV-NEXT:    li a2, 6
@@ -727,46 +727,46 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64MV:       # %bb.0:
 ; RV64MV-NEXT:    ld a1, 0(a0)
 ; RV64MV-NEXT:    lwu a2, 8(a0)
-; RV64MV-NEXT:    srli a3, a1, 2
-; RV64MV-NEXT:    lbu a4, 12(a0)
+; RV64MV-NEXT:    lbu a3, 12(a0)
+; RV64MV-NEXT:    srli a4, a1, 2
 ; RV64MV-NEXT:    slli a5, a2, 62
-; RV64MV-NEXT:    or a3, a5, a3
-; RV64MV-NEXT:    srai a3, a3, 31
-; RV64MV-NEXT:    slli a4, a4, 32
-; RV64MV-NEXT:    or a2, a2, a4
+; RV64MV-NEXT:    or a4, a5, a4
+; RV64MV-NEXT:    srai a4, a4, 31
+; RV64MV-NEXT:    slli a3, a3, 32
+; RV64MV-NEXT:    or a2, a2, a3
 ; RV64MV-NEXT:    slli a2, a2, 29
-; RV64MV-NEXT:    lui a4, %hi(.LCPI3_0)
-; RV64MV-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64MV-NEXT:    lui a3, %hi(.LCPI3_0)
+; RV64MV-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
 ; RV64MV-NEXT:    srai a2, a2, 31
 ; RV64MV-NEXT:    slli a1, a1, 31
 ; RV64MV-NEXT:    srai a1, a1, 31
-; RV64MV-NEXT:    mulh a4, a2, a4
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    srai a4, a4, 1
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    mulh a3, a2, a3
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    srai a3, a3, 1
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64MV-NEXT:    add a2, a2, a4
-; RV64MV-NEXT:    slli a4, a4, 2
-; RV64MV-NEXT:    add a2, a2, a4
-; RV64MV-NEXT:    mulh a4, a3, a5
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    srai a4, a4, 1
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    slli a3, a3, 2
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    mulh a3, a4, a5
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    srai a3, a3, 1
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_2)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_2)(a5)
-; RV64MV-NEXT:    add a3, a3, a4
-; RV64MV-NEXT:    slli a4, a4, 3
-; RV64MV-NEXT:    sub a3, a3, a4
-; RV64MV-NEXT:    mulh a4, a1, a5
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    add a4, a4, a3
+; RV64MV-NEXT:    slli a3, a3, 3
+; RV64MV-NEXT:    sub a4, a4, a3
+; RV64MV-NEXT:    mulh a3, a1, a5
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    li a5, 6
-; RV64MV-NEXT:    mul a4, a4, a5
-; RV64MV-NEXT:    sub a1, a1, a4
+; RV64MV-NEXT:    mul a3, a3, a5
+; RV64MV-NEXT:    sub a1, a1, a3
 ; RV64MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64MV-NEXT:    vmv.v.x v8, a1
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a3
+; RV64MV-NEXT:    vslide1down.vx v8, v8, a4
 ; RV64MV-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64MV-NEXT:    li a1, -1
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 3335ca3a34b6c6..ed10341580ef8d 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -18,29 +18,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh s0, 12(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
-; RV32I-NEXT:    lh s2, 4(a1)
 ; RV32I-NEXT:    lh a2, 0(a1)
+; RV32I-NEXT:    lh s0, 4(a1)
+; RV32I-NEXT:    lh s1, 8(a1)
+; RV32I-NEXT:    lh s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, -124
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __modsi3@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, -1003
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -53,52 +53,52 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_srem_vec_1:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 0(a1)
-; RV32IM-NEXT:    lh a1, 4(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    add a5, a5, a4
+; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    add a5, a5, a2
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 507375
 ; RV32IM-NEXT:    addi a5, a5, 1981
-; RV32IM-NEXT:    mulh a5, a1, a5
-; RV32IM-NEXT:    sub a5, a5, a1
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    sub a5, a5, a3
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, -124
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sub a3, a3, a5
 ; RV32IM-NEXT:    lui a5, 342392
 ; RV32IM-NEXT:    addi a5, a5, 669
-; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    mulh a5, a4, a5
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 5
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 98
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 780943
 ; RV32IM-NEXT:    addi a5, a5, 1809
-; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    mulh a5, a1, a5
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 8
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, -1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_srem_vec_1:
@@ -110,29 +110,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh s0, 24(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
-; RV64I-NEXT:    lh s2, 8(a1)
 ; RV64I-NEXT:    lh a2, 0(a1)
+; RV64I-NEXT:    lh s0, 8(a1)
+; RV64I-NEXT:    lh s1, 16(a1)
+; RV64I-NEXT:    lh s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, -124
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, -1003
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -145,52 +145,52 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_srem_vec_1:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI0_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI0_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
+; RV64IM-NEXT:    lh a3, 0(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
 ; RV64IM-NEXT:    lh a5, 16(a1)
-; RV64IM-NEXT:    lh a1, 8(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 6
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
 ; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a1, a6
-; RV64IM-NEXT:    sub a3, a3, a1
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a4, a6
+; RV64IM-NEXT:    sub a2, a2, a4
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 6
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_2)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_2)(a6)
 ; RV64IM-NEXT:    li a7, -124
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulh a3, a5, a6
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 5
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulh a2, a5, a6
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 5
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_3)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_3)(a6)
 ; RV64IM-NEXT:    li a7, 98
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    mulh a3, a4, a6
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 7
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a5, a5, a2
+; RV64IM-NEXT:    mulh a2, a1, a6
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 7
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    li a6, -1003
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
   ret <4 x i16> %1
@@ -206,29 +206,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh s0, 12(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
-; RV32I-NEXT:    lh s2, 4(a1)
 ; RV32I-NEXT:    lh a2, 0(a1)
+; RV32I-NEXT:    lh s0, 4(a1)
+; RV32I-NEXT:    lh s1, 8(a1)
+; RV32I-NEXT:    lh s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __modsi3@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -241,45 +241,45 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_srem_vec_2:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 0(a1)
-; RV32IM-NEXT:    lh a1, 4(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a6, a4, a5
-; RV32IM-NEXT:    add a6, a6, a4
+; RV32IM-NEXT:    mulh a6, a2, a5
+; RV32IM-NEXT:    add a6, a6, a2
 ; RV32IM-NEXT:    srli a7, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, a7
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
-; RV32IM-NEXT:    mulh a6, a1, a5
-; RV32IM-NEXT:    add a6, a6, a1
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    mulh a6, a3, a5
+; RV32IM-NEXT:    add a6, a6, a3
 ; RV32IM-NEXT:    srli t0, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a1, a1, a6
-; RV32IM-NEXT:    mulh a6, a3, a5
-; RV32IM-NEXT:    add a6, a6, a3
+; RV32IM-NEXT:    sub a3, a3, a6
+; RV32IM-NEXT:    mulh a6, a4, a5
+; RV32IM-NEXT:    add a6, a6, a4
 ; RV32IM-NEXT:    srli t0, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulh a5, a1, a5
+; RV32IM-NEXT:    add a5, a5, a1
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_srem_vec_2:
@@ -291,29 +291,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh s0, 24(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
-; RV64I-NEXT:    lh s2, 8(a1)
 ; RV64I-NEXT:    lh a2, 0(a1)
+; RV64I-NEXT:    lh s0, 8(a1)
+; RV64I-NEXT:    lh s1, 16(a1)
+; RV64I-NEXT:    lh s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -326,45 +326,45 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_srem_vec_2:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI1_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI1_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI1_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI1_0)(a2)
+; RV64IM-NEXT:    lh a3, 0(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
 ; RV64IM-NEXT:    lh a5, 16(a1)
-; RV64IM-NEXT:    lh a1, 8(a1)
-; RV64IM-NEXT:    mulh a6, a2, a3
-; RV64IM-NEXT:    add a6, a6, a2
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    mulh a6, a3, a2
+; RV64IM-NEXT:    add a6, a6, a3
 ; RV64IM-NEXT:    srli a7, a6, 63
 ; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    add a6, a6, a7
 ; RV64IM-NEXT:    li a7, 95
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a2, a2, a6
-; RV64IM-NEXT:    mulh a6, a1, a3
-; RV64IM-NEXT:    add a6, a6, a1
+; RV64IM-NEXT:    subw a3, a3, a6
+; RV64IM-NEXT:    mulh a6, a4, a2
+; RV64IM-NEXT:    add a6, a6, a4
 ; RV64IM-NEXT:    srli t0, a6, 63
 ; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    add a6, a6, t0
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a1, a1, a6
-; RV64IM-NEXT:    mulh a6, a5, a3
+; RV64IM-NEXT:    subw a4, a4, a6
+; RV64IM-NEXT:    mulh a6, a5, a2
 ; RV64IM-NEXT:    add a6, a6, a5
 ; RV64IM-NEXT:    srli t0, a6, 63
 ; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    add a6, a6, t0
 ; RV64IM-NEXT:    mul a6, a6, a7
 ; RV64IM-NEXT:    subw a5, a5, a6
-; RV64IM-NEXT:    mulh a3, a4, a3
-; RV64IM-NEXT:    add a3, a3, a4
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, a6
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mulh a2, a1, a2
+; RV64IM-NEXT:    add a2, a2, a1
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 6
+; RV64IM-NEXT:    add a2, a2, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   ret <4 x i16> %1
@@ -445,14 +445,14 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_srem_sdiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 0(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
-; RV32IM-NEXT:    lh a4, 12(a1)
+; RV32IM-NEXT:    lh a2, 12(a1)
+; RV32IM-NEXT:    lh a3, 0(a1)
+; RV32IM-NEXT:    lh a4, 4(a1)
 ; RV32IM-NEXT:    lh a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a6, a4, a5
-; RV32IM-NEXT:    add a6, a6, a4
+; RV32IM-NEXT:    mulh a6, a2, a5
+; RV32IM-NEXT:    add a6, a6, a2
 ; RV32IM-NEXT:    srli a7, a6, 31
 ; RV32IM-NEXT:    srai a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, a7
@@ -464,30 +464,30 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    srai t1, t1, 6
 ; RV32IM-NEXT:    add t1, t1, t2
 ; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulh t3, a3, a5
-; RV32IM-NEXT:    add t3, t3, a3
+; RV32IM-NEXT:    mulh t3, a4, a5
+; RV32IM-NEXT:    add t3, t3, a4
 ; RV32IM-NEXT:    srli t4, t3, 31
 ; RV32IM-NEXT:    srai t3, t3, 6
 ; RV32IM-NEXT:    add t3, t3, t4
 ; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    add a5, a5, a3
 ; RV32IM-NEXT:    srli t5, a5, 31
 ; RV32IM-NEXT:    srai a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, t5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a2, a2, a5
-; RV32IM-NEXT:    sub a2, a2, a7
-; RV32IM-NEXT:    add a3, a3, t3
-; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a3, a3, a5
+; RV32IM-NEXT:    sub a3, a3, a7
+; RV32IM-NEXT:    add a4, a4, t3
+; RV32IM-NEXT:    sub a4, a4, t4
 ; RV32IM-NEXT:    add a1, a1, t1
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a4, a4, a6
-; RV32IM-NEXT:    sub a4, a4, t0
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    add a2, a2, a6
+; RV32IM-NEXT:    sub a2, a2, t0
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
-; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_srem_sdiv:
@@ -624,21 +624,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lh a2, 0(a1)
+; RV32I-NEXT:    lh a3, 4(a1)
+; RV32I-NEXT:    lh a4, 8(a1)
 ; RV32I-NEXT:    lh a0, 12(a1)
-; RV32I-NEXT:    lh a3, 8(a1)
-; RV32I-NEXT:    lh a1, 4(a1)
-; RV32I-NEXT:    srli a4, a2, 26
-; RV32I-NEXT:    add a4, a2, a4
-; RV32I-NEXT:    andi a4, a4, -64
-; RV32I-NEXT:    sub s1, a2, a4
-; RV32I-NEXT:    srli a2, a1, 27
-; RV32I-NEXT:    add a2, a1, a2
-; RV32I-NEXT:    andi a2, a2, -32
-; RV32I-NEXT:    sub s2, a1, a2
-; RV32I-NEXT:    srli a1, a3, 29
+; RV32I-NEXT:    srli a1, a2, 26
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    andi a1, a1, -64
+; RV32I-NEXT:    sub s1, a2, a1
+; RV32I-NEXT:    srli a1, a3, 27
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    andi a1, a1, -32
+; RV32I-NEXT:    sub s2, a3, a1
+; RV32I-NEXT:    srli a1, a4, 29
+; RV32I-NEXT:    add a1, a4, a1
 ; RV32I-NEXT:    andi a1, a1, -8
-; RV32I-NEXT:    sub s3, a3, a1
+; RV32I-NEXT:    sub s3, a4, a1
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    sh a0, 6(s0)
@@ -655,8 +655,8 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 8(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a2, 4(a1)
+; RV32IM-NEXT:    lh a3, 8(a1)
 ; RV32IM-NEXT:    lh a4, 12(a1)
 ; RV32IM-NEXT:    lh a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 706409
@@ -673,16 +673,16 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    add a5, a1, a5
 ; RV32IM-NEXT:    andi a5, a5, -64
 ; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    srli a5, a3, 27
-; RV32IM-NEXT:    add a5, a3, a5
-; RV32IM-NEXT:    andi a5, a5, -32
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    srli a5, a2, 29
+; RV32IM-NEXT:    srli a5, a2, 27
 ; RV32IM-NEXT:    add a5, a2, a5
-; RV32IM-NEXT:    andi a5, a5, -8
+; RV32IM-NEXT:    andi a5, a5, -32
 ; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    srli a5, a3, 29
+; RV32IM-NEXT:    add a5, a3, a5
+; RV32IM-NEXT:    andi a5, a5, -8
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
 ; RV32IM-NEXT:    sh a4, 6(a0)
 ; RV32IM-NEXT:    ret
@@ -697,21 +697,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lh a2, 0(a1)
+; RV64I-NEXT:    lh a3, 8(a1)
+; RV64I-NEXT:    lh a4, 16(a1)
 ; RV64I-NEXT:    lh a0, 24(a1)
-; RV64I-NEXT:    lh a3, 16(a1)
-; RV64I-NEXT:    lh a1, 8(a1)
-; RV64I-NEXT:    srli a4, a2, 58
-; RV64I-NEXT:    add a4, a2, a4
-; RV64I-NEXT:    andi a4, a4, -64
-; RV64I-NEXT:    subw s1, a2, a4
-; RV64I-NEXT:    srli a2, a1, 59
-; RV64I-NEXT:    add a2, a1, a2
-; RV64I-NEXT:    andi a2, a2, -32
-; RV64I-NEXT:    subw s2, a1, a2
-; RV64I-NEXT:    srli a1, a3, 61
+; RV64I-NEXT:    srli a1, a2, 58
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    andi a1, a1, -64
+; RV64I-NEXT:    subw s1, a2, a1
+; RV64I-NEXT:    srli a1, a3, 59
 ; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    andi a1, a1, -32
+; RV64I-NEXT:    subw s2, a3, a1
+; RV64I-NEXT:    srli a1, a4, 61
+; RV64I-NEXT:    add a1, a4, a1
 ; RV64I-NEXT:    andi a1, a1, -8
-; RV64I-NEXT:    subw s3, a3, a1
+; RV64I-NEXT:    subw s3, a4, a1
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    sh a0, 6(s0)
@@ -773,24 +773,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh s0, 12(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
 ; RV32I-NEXT:    lh a2, 4(a1)
+; RV32I-NEXT:    lh s0, 8(a1)
+; RV32I-NEXT:    lh s1, 12(a1)
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    li a1, 654
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __modsi3@plt
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a1, a0, 1327
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3@plt
 ; RV32I-NEXT:    sh a0, 6(s2)
-; RV32I-NEXT:    sh s1, 4(s2)
+; RV32I-NEXT:    sh s0, 4(s2)
 ; RV32I-NEXT:    sh s3, 2(s2)
 ; RV32I-NEXT:    sh zero, 0(s2)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -803,43 +803,43 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_one:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
-; RV32IM-NEXT:    lh a1, 8(a1)
+; RV32IM-NEXT:    lh a2, 4(a1)
+; RV32IM-NEXT:    lh a3, 8(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a4, 820904
 ; RV32IM-NEXT:    addi a4, a4, -1903
-; RV32IM-NEXT:    mulh a4, a3, a4
-; RV32IM-NEXT:    add a4, a4, a3
+; RV32IM-NEXT:    mulh a4, a2, a4
+; RV32IM-NEXT:    add a4, a4, a2
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 9
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 729444
 ; RV32IM-NEXT:    addi a4, a4, 713
-; RV32IM-NEXT:    mulh a4, a1, a4
-; RV32IM-NEXT:    add a4, a4, a1
+; RV32IM-NEXT:    mulh a4, a3, a4
+; RV32IM-NEXT:    add a4, a4, a3
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 4
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    li a5, 23
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a1, a1, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    lui a4, 395996
 ; RV32IM-NEXT:    addi a4, a4, -2009
-; RV32IM-NEXT:    mulh a4, a2, a4
+; RV32IM-NEXT:    mulh a4, a1, a4
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 11
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_one:
@@ -850,24 +850,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh s0, 24(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
 ; RV64I-NEXT:    lh a2, 8(a1)
+; RV64I-NEXT:    lh s0, 16(a1)
+; RV64I-NEXT:    lh s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3@plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    sh a0, 6(s2)
-; RV64I-NEXT:    sh s1, 4(s2)
+; RV64I-NEXT:    sh s0, 4(s2)
 ; RV64I-NEXT:    sh s3, 2(s2)
 ; RV64I-NEXT:    sh zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -880,42 +880,42 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_srem_one:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI4_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI4_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
-; RV64IM-NEXT:    lh a1, 8(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 4
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    lui a2, %hi(.LCPI4_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI4_0)(a2)
+; RV64IM-NEXT:    lh a3, 16(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 4
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_1)(a5)
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a1, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 8
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a4, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 8
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulh a3, a4, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 11
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulh a2, a1, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 11
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    subw a1, a1, a2
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a4, 6(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 4(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 4(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -933,8 +933,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lh a2, 4(a1)
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lh s1, 12(a1)
 ; RV32I-NEXT:    lh a0, 8(a1)
+; RV32I-NEXT:    lh s1, 12(a1)
 ; RV32I-NEXT:    srli a1, a2, 17
 ; RV32I-NEXT:    add a1, a2, a1
 ; RV32I-NEXT:    lui a3, 8
@@ -1005,8 +1005,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lh a2, 8(a1)
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lh s1, 24(a1)
 ; RV64I-NEXT:    lh a0, 16(a1)
+; RV64I-NEXT:    lh s1, 24(a1)
 ; RV64I-NEXT:    srli a1, a2, 49
 ; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    lui a3, 8
@@ -1033,38 +1033,38 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_i16_smax:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI5_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI5_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 4
-; RV64IM-NEXT:    add a3, a3, a5
-; RV64IM-NEXT:    li a5, 23
-; RV64IM-NEXT:    lui a6, %hi(.LCPI5_1)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI5_1)(a6)
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    lh a1, 8(a1)
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a4, a6
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 11
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI5_0)(a2)
+; RV64IM-NEXT:    lh a3, 16(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 4
+; RV64IM-NEXT:    add a2, a2, a5
+; RV64IM-NEXT:    lui a5, %hi(.LCPI5_1)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI5_1)(a5)
+; RV64IM-NEXT:    li a6, 23
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a1, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 11
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    srli a3, a1, 49
-; RV64IM-NEXT:    add a3, a1, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    srli a2, a4, 49
+; RV64IM-NEXT:    add a2, a4, a2
 ; RV64IM-NEXT:    lui a5, 8
-; RV64IM-NEXT:    and a3, a3, a5
-; RV64IM-NEXT:    subw a1, a1, a3
+; RV64IM-NEXT:    and a2, a2, a5
+; RV64IM-NEXT:    subw a4, a4, a2
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a4, 6(a0)
-; RV64IM-NEXT:    sh a2, 4(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
+; RV64IM-NEXT:    sh a3, 4(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -1085,28 +1085,29 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 24(a1)
-; RV32I-NEXT:    lw s1, 28(a1)
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw s0, 8(a1)
+; RV32I-NEXT:    lw s1, 12(a1)
 ; RV32I-NEXT:    lw s2, 16(a1)
 ; RV32I-NEXT:    lw s3, 20(a1)
-; RV32I-NEXT:    lw s4, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    lw s4, 24(a1)
+; RV32I-NEXT:    lw s5, 28(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
-; RV32I-NEXT:    mv s4, a0
-; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    li a2, 23
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    mv a1, s3
@@ -1116,16 +1117,16 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s4
+; RV32I-NEXT:    mv a1, s5
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    sw a1, 28(s6)
 ; RV32I-NEXT:    sw a0, 24(s6)
 ; RV32I-NEXT:    sw s3, 20(s6)
 ; RV32I-NEXT:    sw s2, 16(s6)
-; RV32I-NEXT:    sw s5, 12(s6)
-; RV32I-NEXT:    sw s4, 8(s6)
+; RV32I-NEXT:    sw s1, 12(s6)
+; RV32I-NEXT:    sw s0, 8(s6)
 ; RV32I-NEXT:    sw s8, 4(s6)
 ; RV32I-NEXT:    sw s7, 0(s6)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -1154,28 +1155,29 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 24(a1)
-; RV32IM-NEXT:    lw s1, 28(a1)
+; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw s0, 8(a1)
+; RV32IM-NEXT:    lw s1, 12(a1)
 ; RV32IM-NEXT:    lw s2, 16(a1)
 ; RV32IM-NEXT:    lw s3, 20(a1)
-; RV32IM-NEXT:    lw s4, 8(a1)
-; RV32IM-NEXT:    lw s5, 12(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
+; RV32IM-NEXT:    lw s4, 24(a1)
+; RV32IM-NEXT:    lw s5, 28(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
 ; RV32IM-NEXT:    mv a0, a3
+; RV32IM-NEXT:    mv a1, a4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    mv a0, s0
+; RV32IM-NEXT:    mv a1, s1
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
-; RV32IM-NEXT:    mv s4, a0
-; RV32IM-NEXT:    mv s5, a1
+; RV32IM-NEXT:    mv s0, a0
+; RV32IM-NEXT:    mv s1, a1
 ; RV32IM-NEXT:    li a2, 23
 ; RV32IM-NEXT:    mv a0, s2
 ; RV32IM-NEXT:    mv a1, s3
@@ -1185,16 +1187,16 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s0
-; RV32IM-NEXT:    mv a1, s1
+; RV32IM-NEXT:    mv a0, s4
+; RV32IM-NEXT:    mv a1, s5
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
 ; RV32IM-NEXT:    sw a0, 24(s6)
 ; RV32IM-NEXT:    sw s3, 20(s6)
 ; RV32IM-NEXT:    sw s2, 16(s6)
-; RV32IM-NEXT:    sw s5, 12(s6)
-; RV32IM-NEXT:    sw s4, 8(s6)
+; RV32IM-NEXT:    sw s1, 12(s6)
+; RV32IM-NEXT:    sw s0, 8(s6)
 ; RV32IM-NEXT:    sw s8, 4(s6)
 ; RV32IM-NEXT:    sw s7, 0(s6)
 ; RV32IM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -1218,24 +1220,24 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    ld s0, 24(a1)
-; RV64I-NEXT:    ld s1, 16(a1)
 ; RV64I-NEXT:    ld a2, 8(a1)
+; RV64I-NEXT:    ld s0, 16(a1)
+; RV64I-NEXT:    ld s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3@plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3@plt
 ; RV64I-NEXT:    sd a0, 24(s2)
-; RV64I-NEXT:    sd s1, 16(s2)
+; RV64I-NEXT:    sd s0, 16(s2)
 ; RV64I-NEXT:    sd s3, 8(s2)
 ; RV64I-NEXT:    sd zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -1248,42 +1250,42 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_srem_i64:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    ld a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI6_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI6_0)(a3)
-; RV64IM-NEXT:    ld a4, 24(a1)
-; RV64IM-NEXT:    ld a1, 8(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srai a3, a3, 4
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    lui a2, %hi(.LCPI6_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64IM-NEXT:    ld a3, 16(a1)
+; RV64IM-NEXT:    ld a4, 8(a1)
+; RV64IM-NEXT:    ld a1, 24(a1)
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srai a2, a2, 4
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI6_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI6_1)(a5)
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    sub a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a1, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srai a3, a3, 8
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    sub a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a4, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srai a2, a2, 8
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI6_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI6_2)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    sub a1, a1, a3
-; RV64IM-NEXT:    mulh a3, a4, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srai a3, a3, 11
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    sub a4, a4, a2
+; RV64IM-NEXT:    mulh a2, a1, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srai a2, a2, 11
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addiw a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    sub a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    sub a1, a1, a2
 ; RV64IM-NEXT:    sd zero, 0(a0)
-; RV64IM-NEXT:    sd a4, 24(a0)
-; RV64IM-NEXT:    sd a1, 8(a0)
-; RV64IM-NEXT:    sd a2, 16(a0)
+; RV64IM-NEXT:    sd a1, 24(a0)
+; RV64IM-NEXT:    sd a4, 8(a0)
+; RV64IM-NEXT:    sd a3, 16(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
   ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll
index 651df94bab496e..2b2ba675b29f22 100644
--- a/llvm/test/CodeGen/RISCV/stack-store-check.ll
+++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll
@@ -143,15 +143,15 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    addi a2, sp, 392
 ; CHECK-NEXT:    sw a3, 392(sp)
 ; CHECK-NEXT:    call __subtf3@plt
-; CHECK-NEXT:    lw a0, 424(sp)
+; CHECK-NEXT:    lw a0, 432(sp)
 ; CHECK-NEXT:    lw a1, 436(sp)
-; CHECK-NEXT:    lw a2, 432(sp)
+; CHECK-NEXT:    lw a2, 424(sp)
 ; CHECK-NEXT:    lw a3, 428(sp)
 ; CHECK-NEXT:    lui a4, %hi(X)
 ; CHECK-NEXT:    sw a1, %lo(X+12)(a4)
-; CHECK-NEXT:    sw a2, %lo(X+8)(a4)
+; CHECK-NEXT:    sw a0, %lo(X+8)(a4)
 ; CHECK-NEXT:    sw a3, %lo(X+4)(a4)
-; CHECK-NEXT:    sw a0, %lo(X)(a4)
+; CHECK-NEXT:    sw a2, %lo(X)(a4)
 ; CHECK-NEXT:    lw s8, 4(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw s8, 212(sp)
 ; CHECK-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
@@ -190,15 +190,15 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    addi a2, sp, 344
 ; CHECK-NEXT:    sw s9, 360(sp)
 ; CHECK-NEXT:    call __multf3@plt
-; CHECK-NEXT:    lw a0, 376(sp)
+; CHECK-NEXT:    lw a0, 384(sp)
 ; CHECK-NEXT:    lw a1, 388(sp)
-; CHECK-NEXT:    lw a2, 384(sp)
+; CHECK-NEXT:    lw a2, 376(sp)
 ; CHECK-NEXT:    lw a3, 380(sp)
 ; CHECK-NEXT:    lui a4, %hi(S)
 ; CHECK-NEXT:    sw a1, %lo(S+12)(a4)
-; CHECK-NEXT:    sw a2, %lo(S+8)(a4)
+; CHECK-NEXT:    sw a0, %lo(S+8)(a4)
 ; CHECK-NEXT:    sw a3, %lo(S+4)(a4)
-; CHECK-NEXT:    sw a0, %lo(S)(a4)
+; CHECK-NEXT:    sw a2, %lo(S)(a4)
 ; CHECK-NEXT:    lw a0, 48(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw a0, 260(sp)
 ; CHECK-NEXT:    sw s10, 256(sp)
@@ -216,15 +216,15 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    lw a3, 44(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw a3, 264(sp)
 ; CHECK-NEXT:    call __subtf3@plt
-; CHECK-NEXT:    lw a0, 280(sp)
+; CHECK-NEXT:    lw a0, 288(sp)
 ; CHECK-NEXT:    lw a1, 292(sp)
-; CHECK-NEXT:    lw a2, 288(sp)
+; CHECK-NEXT:    lw a2, 280(sp)
 ; CHECK-NEXT:    lw a3, 284(sp)
 ; CHECK-NEXT:    lui a4, %hi(T)
 ; CHECK-NEXT:    sw a1, %lo(T+12)(a4)
-; CHECK-NEXT:    sw a2, %lo(T+8)(a4)
+; CHECK-NEXT:    sw a0, %lo(T+8)(a4)
 ; CHECK-NEXT:    sw a3, %lo(T+4)(a4)
-; CHECK-NEXT:    sw a0, %lo(T)(a4)
+; CHECK-NEXT:    sw a2, %lo(T)(a4)
 ; CHECK-NEXT:    sw zero, 164(sp)
 ; CHECK-NEXT:    sw zero, 160(sp)
 ; CHECK-NEXT:    sw zero, 156(sp)
@@ -238,15 +238,15 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    addi a2, sp, 152
 ; CHECK-NEXT:    sw s1, 168(sp)
 ; CHECK-NEXT:    call __addtf3@plt
-; CHECK-NEXT:    lw a0, 184(sp)
+; CHECK-NEXT:    lw a0, 192(sp)
 ; CHECK-NEXT:    lw a1, 196(sp)
-; CHECK-NEXT:    lw a2, 192(sp)
+; CHECK-NEXT:    lw a2, 184(sp)
 ; CHECK-NEXT:    lw a3, 188(sp)
 ; CHECK-NEXT:    lui a4, %hi(Y)
 ; CHECK-NEXT:    sw a1, %lo(Y+12)(a4)
-; CHECK-NEXT:    sw a2, %lo(Y+8)(a4)
+; CHECK-NEXT:    sw a0, %lo(Y+8)(a4)
 ; CHECK-NEXT:    sw a3, %lo(Y+4)(a4)
-; CHECK-NEXT:    sw a0, %lo(Y)(a4)
+; CHECK-NEXT:    sw a2, %lo(Y)(a4)
 ; CHECK-NEXT:    sw zero, 116(sp)
 ; CHECK-NEXT:    sw zero, 112(sp)
 ; CHECK-NEXT:    sw zero, 108(sp)
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 8c0d97afe6c21d..1a9126acac8da1 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -10,47 +10,47 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a3, 12(a1)
-; RISCV32-NEXT:    lw a7, 12(a2)
-; RISCV32-NEXT:    lw a6, 8(a1)
-; RISCV32-NEXT:    lw a4, 0(a2)
-; RISCV32-NEXT:    lw a5, 0(a1)
+; RISCV32-NEXT:    lw a3, 0(a1)
 ; RISCV32-NEXT:    lw t2, 4(a1)
-; RISCV32-NEXT:    lw t0, 8(a2)
-; RISCV32-NEXT:    lw a2, 4(a2)
-; RISCV32-NEXT:    mulhu a1, a5, a4
-; RISCV32-NEXT:    mul t1, t2, a4
-; RISCV32-NEXT:    add a1, t1, a1
-; RISCV32-NEXT:    sltu t1, a1, t1
-; RISCV32-NEXT:    mulhu t3, t2, a4
+; RISCV32-NEXT:    lw a4, 8(a1)
+; RISCV32-NEXT:    lw a5, 12(a1)
+; RISCV32-NEXT:    lw a1, 0(a2)
+; RISCV32-NEXT:    lw t0, 4(a2)
+; RISCV32-NEXT:    lw a6, 8(a2)
+; RISCV32-NEXT:    lw a7, 12(a2)
+; RISCV32-NEXT:    mulhu a2, a3, a1
+; RISCV32-NEXT:    mul t1, t2, a1
+; RISCV32-NEXT:    add a2, t1, a2
+; RISCV32-NEXT:    sltu t1, a2, t1
+; RISCV32-NEXT:    mulhu t3, t2, a1
 ; RISCV32-NEXT:    add t4, t3, t1
-; RISCV32-NEXT:    mul t1, a5, a2
-; RISCV32-NEXT:    add a1, t1, a1
-; RISCV32-NEXT:    sltu t1, a1, t1
-; RISCV32-NEXT:    mulhu t3, a5, a2
+; RISCV32-NEXT:    mul t1, a3, t0
+; RISCV32-NEXT:    add a2, t1, a2
+; RISCV32-NEXT:    sltu t1, a2, t1
+; RISCV32-NEXT:    mulhu t3, a3, t0
 ; RISCV32-NEXT:    add t1, t3, t1
 ; RISCV32-NEXT:    add t5, t4, t1
-; RISCV32-NEXT:    mul t6, t2, a2
+; RISCV32-NEXT:    mul t6, t2, t0
 ; RISCV32-NEXT:    add s0, t6, t5
-; RISCV32-NEXT:    mul t1, t0, a5
-; RISCV32-NEXT:    mul s3, a6, a4
+; RISCV32-NEXT:    mul t1, a6, a3
+; RISCV32-NEXT:    mul s3, a4, a1
 ; RISCV32-NEXT:    add s4, s3, t1
 ; RISCV32-NEXT:    add t1, s0, s4
 ; RISCV32-NEXT:    sltu t3, t1, s0
 ; RISCV32-NEXT:    sltu s0, s0, t6
 ; RISCV32-NEXT:    sltu t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, t2, a2
+; RISCV32-NEXT:    mulhu t5, t2, t0
 ; RISCV32-NEXT:    add t4, t5, t4
 ; RISCV32-NEXT:    add s0, t4, s0
-; RISCV32-NEXT:    mul t4, t2, t0
-; RISCV32-NEXT:    mul t5, a7, a5
+; RISCV32-NEXT:    mul t4, t2, a6
+; RISCV32-NEXT:    mul t5, a7, a3
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu s1, t0, a5
+; RISCV32-NEXT:    mulhu s1, a6, a3
 ; RISCV32-NEXT:    add s2, s1, t4
-; RISCV32-NEXT:    mul t4, a2, a6
-; RISCV32-NEXT:    mul t5, a3, a4
+; RISCV32-NEXT:    mul t4, t0, a4
+; RISCV32-NEXT:    mul t5, a5, a1
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, a6, a4
+; RISCV32-NEXT:    mulhu t5, a4, a1
 ; RISCV32-NEXT:    add t6, t5, t4
 ; RISCV32-NEXT:    add t4, t6, s2
 ; RISCV32-NEXT:    sltu s3, s4, s3
@@ -65,39 +65,39 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    snez s1, t2
 ; RISCV32-NEXT:    snez s2, a7
 ; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    mulhu s2, a7, a5
+; RISCV32-NEXT:    mulhu s2, a7, a3
 ; RISCV32-NEXT:    snez s2, s2
 ; RISCV32-NEXT:    or s1, s1, s2
-; RISCV32-NEXT:    mulhu t2, t2, t0
+; RISCV32-NEXT:    mulhu t2, t2, a6
 ; RISCV32-NEXT:    snez t2, t2
 ; RISCV32-NEXT:    or t2, s1, t2
 ; RISCV32-NEXT:    or t2, t2, s0
 ; RISCV32-NEXT:    sltu t5, t6, t5
-; RISCV32-NEXT:    snez t6, a2
-; RISCV32-NEXT:    snez s0, a3
+; RISCV32-NEXT:    snez t6, t0
+; RISCV32-NEXT:    snez s0, a5
 ; RISCV32-NEXT:    and t6, s0, t6
-; RISCV32-NEXT:    mulhu s0, a3, a4
+; RISCV32-NEXT:    mulhu s0, a5, a1
 ; RISCV32-NEXT:    snez s0, s0
 ; RISCV32-NEXT:    or t6, t6, s0
-; RISCV32-NEXT:    mulhu a2, a2, a6
-; RISCV32-NEXT:    snez a2, a2
-; RISCV32-NEXT:    or a2, t6, a2
-; RISCV32-NEXT:    or a2, a2, t5
-; RISCV32-NEXT:    or a7, t0, a7
-; RISCV32-NEXT:    snez a7, a7
-; RISCV32-NEXT:    or a3, a6, a3
-; RISCV32-NEXT:    snez a3, a3
-; RISCV32-NEXT:    and a3, a3, a7
-; RISCV32-NEXT:    or a2, a3, a2
-; RISCV32-NEXT:    or a3, t2, t3
-; RISCV32-NEXT:    or a2, a2, a3
-; RISCV32-NEXT:    mul a3, a5, a4
-; RISCV32-NEXT:    andi a2, a2, 1
-; RISCV32-NEXT:    sw a3, 0(a0)
-; RISCV32-NEXT:    sw a1, 4(a0)
+; RISCV32-NEXT:    mulhu t0, t0, a4
+; RISCV32-NEXT:    snez t0, t0
+; RISCV32-NEXT:    or t0, t6, t0
+; RISCV32-NEXT:    or t0, t0, t5
+; RISCV32-NEXT:    or a6, a6, a7
+; RISCV32-NEXT:    snez a6, a6
+; RISCV32-NEXT:    or a4, a4, a5
+; RISCV32-NEXT:    snez a4, a4
+; RISCV32-NEXT:    and a4, a4, a6
+; RISCV32-NEXT:    or a4, a4, t0
+; RISCV32-NEXT:    or a5, t2, t3
+; RISCV32-NEXT:    or a4, a4, a5
+; RISCV32-NEXT:    mul a1, a3, a1
+; RISCV32-NEXT:    andi a4, a4, 1
+; RISCV32-NEXT:    sw a1, 0(a0)
+; RISCV32-NEXT:    sw a2, 4(a0)
 ; RISCV32-NEXT:    sw t1, 8(a0)
 ; RISCV32-NEXT:    sw t4, 12(a0)
-; RISCV32-NEXT:    sb a2, 16(a0)
+; RISCV32-NEXT:    sb a4, 16(a0)
 ; RISCV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index 599b0d08629eaf..f2a7008728db34 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -85,50 +85,49 @@ define i32 @load_i32(ptr %p) {
 define i64 @load_i64(ptr %p) {
 ; RV32I-LABEL: load_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    lbu a2, 0(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a2, 1(a0)
 ; RV32I-NEXT:    lbu a3, 2(a0)
 ; RV32I-NEXT:    lbu a4, 3(a0)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 5(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
+; RV32I-NEXT:    slli a2, a2, 8
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a2, a4, a3
-; RV32I-NEXT:    or a2, a2, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    or a0, a4, a3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a1, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a2, t0, a7
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: load_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 0(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    lbu a2, 1(a0)
 ; RV64I-NEXT:    lbu a3, 2(a0)
 ; RV64I-NEXT:    lbu a4, 3(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a2, a2, 8
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a4, a4, 24
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    lbu a2, 5(a0)
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a2, a2, 8
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a2, a6, a5
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 32aca29d16e9b9..765a3a1ee801c1 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -19,29 +19,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu s0, 12(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
-; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu a2, 0(a1)
+; RV32I-NEXT:    lhu s0, 4(a1)
+; RV32I-NEXT:    lhu s1, 8(a1)
+; RV32I-NEXT:    lhu s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 124
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __umodsi3@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 1003
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -54,38 +54,38 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_urem_vec_1:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 0(a1)
-; RV32IM-NEXT:    lhu a1, 4(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 4(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    mulhu a5, a2, a5
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 8456
 ; RV32IM-NEXT:    addi a5, a5, 1058
-; RV32IM-NEXT:    mulhu a5, a1, a5
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    li a6, 124
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sub a3, a3, a5
 ; RV32IM-NEXT:    lui a5, 10700
 ; RV32IM-NEXT:    addi a5, a5, -1003
-; RV32IM-NEXT:    mulhu a5, a3, a5
+; RV32IM-NEXT:    mulhu a5, a4, a5
 ; RV32IM-NEXT:    li a6, 98
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1045
 ; RV32IM-NEXT:    addi a5, a5, 1801
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a5, a1, a5
 ; RV32IM-NEXT:    li a6, 1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_urem_vec_1:
@@ -97,29 +97,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu s0, 24(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
-; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu a2, 0(a1)
+; RV64I-NEXT:    lhu s0, 8(a1)
+; RV64I-NEXT:    lhu s1, 16(a1)
+; RV64I-NEXT:    lhu s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 124
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 1003
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -132,38 +132,38 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_urem_vec_1:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI0_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
-; RV64IM-NEXT:    lhu a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI0_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
+; RV64IM-NEXT:    lhu a3, 0(a1)
+; RV64IM-NEXT:    lhu a4, 8(a1)
 ; RV64IM-NEXT:    lhu a5, 16(a1)
-; RV64IM-NEXT:    lhu a1, 8(a1)
-; RV64IM-NEXT:    mulhu a3, a2, a3
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    mulhu a2, a3, a2
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
 ; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulhu a3, a1, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulhu a2, a4, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_2)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_2)(a6)
 ; RV64IM-NEXT:    li a7, 124
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulhu a3, a5, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulhu a2, a5, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_3)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_3)(a6)
 ; RV64IM-NEXT:    li a7, 98
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    mulhu a3, a4, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a5, a5, a2
+; RV64IM-NEXT:    mulhu a2, a1, a6
 ; RV64IM-NEXT:    li a6, 1003
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
   ret <4 x i16> %1
@@ -179,29 +179,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu s0, 12(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
-; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu a2, 0(a1)
+; RV32I-NEXT:    lhu s0, 4(a1)
+; RV32I-NEXT:    lhu s1, 8(a1)
+; RV32I-NEXT:    lhu s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __umodsi3@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -214,29 +214,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_urem_vec_2:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 0(a1)
-; RV32IM-NEXT:    lhu a1, 4(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 4(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
-; RV32IM-NEXT:    mulhu a6, a1, a5
-; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a1, a1, a6
+; RV32IM-NEXT:    sub a2, a2, a6
 ; RV32IM-NEXT:    mulhu a6, a3, a5
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mul a6, a6, a7
+; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulhu a5, a1, a5
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_urem_vec_2:
@@ -248,29 +248,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu s0, 24(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
-; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu a2, 0(a1)
+; RV64I-NEXT:    lhu s0, 8(a1)
+; RV64I-NEXT:    lhu s1, 16(a1)
+; RV64I-NEXT:    lhu s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -283,29 +283,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_urem_vec_2:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI1_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI1_0)(a3)
-; RV64IM-NEXT:    lhu a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI1_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI1_0)(a2)
+; RV64IM-NEXT:    lhu a3, 0(a1)
+; RV64IM-NEXT:    lhu a4, 8(a1)
 ; RV64IM-NEXT:    lhu a5, 16(a1)
-; RV64IM-NEXT:    lhu a1, 8(a1)
-; RV64IM-NEXT:    mulhu a6, a2, a3
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    mulhu a6, a3, a2
 ; RV64IM-NEXT:    li a7, 95
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a2, a2, a6
-; RV64IM-NEXT:    mulhu a6, a1, a3
+; RV64IM-NEXT:    subw a3, a3, a6
+; RV64IM-NEXT:    mulhu a6, a4, a2
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a1, a1, a6
-; RV64IM-NEXT:    mulhu a6, a5, a3
+; RV64IM-NEXT:    subw a4, a4, a6
+; RV64IM-NEXT:    mulhu a6, a5, a2
 ; RV64IM-NEXT:    mul a6, a6, a7
 ; RV64IM-NEXT:    subw a5, a5, a6
-; RV64IM-NEXT:    mulhu a3, a4, a3
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mulhu a2, a1, a2
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   ret <4 x i16> %1
@@ -386,33 +386,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_urem_udiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 0(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
-; RV32IM-NEXT:    lhu a4, 12(a1)
+; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a3, 0(a1)
+; RV32IM-NEXT:    lhu a4, 4(a1)
 ; RV32IM-NEXT:    lhu a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul t0, a6, a7
 ; RV32IM-NEXT:    mulhu t1, a1, a5
 ; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulhu t3, a3, a5
+; RV32IM-NEXT:    mulhu t3, a4, a5
 ; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a2, a2, a5
-; RV32IM-NEXT:    sub a2, a2, a7
-; RV32IM-NEXT:    add a3, a3, t3
-; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a3, a3, a5
+; RV32IM-NEXT:    sub a3, a3, a7
+; RV32IM-NEXT:    add a4, a4, t3
+; RV32IM-NEXT:    sub a4, a4, t4
 ; RV32IM-NEXT:    add a1, a1, t1
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a4, a4, a6
-; RV32IM-NEXT:    sub a4, a4, t0
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    add a2, a2, a6
+; RV32IM-NEXT:    sub a2, a2, t0
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
-; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_urem_udiv:
@@ -531,19 +531,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu s1, 8(a1)
+; RV32I-NEXT:    lhu s1, 0(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
-; RV32I-NEXT:    lhu s3, 0(a1)
+; RV32I-NEXT:    lhu s3, 8(a1)
 ; RV32I-NEXT:    lhu a2, 12(a1)
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3@plt
-; RV32I-NEXT:    andi a1, s3, 63
+; RV32I-NEXT:    andi a1, s1, 63
 ; RV32I-NEXT:    andi a2, s2, 31
-; RV32I-NEXT:    andi s1, s1, 7
+; RV32I-NEXT:    andi a3, s3, 7
 ; RV32I-NEXT:    sh a0, 6(s0)
-; RV32I-NEXT:    sh s1, 4(s0)
+; RV32I-NEXT:    sh a3, 4(s0)
 ; RV32I-NEXT:    sh a2, 2(s0)
 ; RV32I-NEXT:    sh a1, 0(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -556,8 +556,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 8(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
+; RV32IM-NEXT:    lhu a2, 4(a1)
+; RV32IM-NEXT:    lhu a3, 8(a1)
 ; RV32IM-NEXT:    lhu a4, 12(a1)
 ; RV32IM-NEXT:    lhu a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 11038
@@ -567,10 +567,10 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    mul a5, a5, a6
 ; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    andi a1, a1, 63
-; RV32IM-NEXT:    andi a3, a3, 31
-; RV32IM-NEXT:    andi a2, a2, 7
-; RV32IM-NEXT:    sh a2, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    andi a2, a2, 31
+; RV32IM-NEXT:    andi a3, a3, 7
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
 ; RV32IM-NEXT:    sh a4, 6(a0)
 ; RV32IM-NEXT:    ret
@@ -583,19 +583,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu s1, 16(a1)
+; RV64I-NEXT:    lhu s1, 0(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
-; RV64I-NEXT:    lhu s3, 0(a1)
+; RV64I-NEXT:    lhu s3, 16(a1)
 ; RV64I-NEXT:    lhu a2, 24(a1)
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3@plt
-; RV64I-NEXT:    andi a1, s3, 63
+; RV64I-NEXT:    andi a1, s1, 63
 ; RV64I-NEXT:    andi a2, s2, 31
-; RV64I-NEXT:    andi s1, s1, 7
+; RV64I-NEXT:    andi a3, s3, 7
 ; RV64I-NEXT:    sh a0, 6(s0)
-; RV64I-NEXT:    sh s1, 4(s0)
+; RV64I-NEXT:    sh a3, 4(s0)
 ; RV64I-NEXT:    sh a2, 2(s0)
 ; RV64I-NEXT:    sh a1, 0(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -640,24 +640,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu s0, 12(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
 ; RV32I-NEXT:    lhu a2, 4(a1)
+; RV32I-NEXT:    lhu s0, 8(a1)
+; RV32I-NEXT:    lhu s1, 12(a1)
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    li a1, 654
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __umodsi3@plt
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a1, a0, 1327
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3@plt
 ; RV32I-NEXT:    sh a0, 6(s2)
-; RV32I-NEXT:    sh s1, 4(s2)
+; RV32I-NEXT:    sh s0, 4(s2)
 ; RV32I-NEXT:    sh s3, 2(s2)
 ; RV32I-NEXT:    sh zero, 0(s2)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -670,32 +670,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_one:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
-; RV32IM-NEXT:    lhu a1, 8(a1)
+; RV32IM-NEXT:    lhu a2, 4(a1)
+; RV32IM-NEXT:    lhu a3, 8(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a4, 1603
 ; RV32IM-NEXT:    addi a4, a4, 1341
-; RV32IM-NEXT:    mulhu a4, a3, a4
+; RV32IM-NEXT:    mulhu a4, a2, a4
 ; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 45590
 ; RV32IM-NEXT:    addi a4, a4, 1069
-; RV32IM-NEXT:    mulhu a4, a1, a4
+; RV32IM-NEXT:    mulhu a4, a3, a4
 ; RV32IM-NEXT:    li a5, 23
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a1, a1, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    lui a4, 193
 ; RV32IM-NEXT:    addi a4, a4, 1464
-; RV32IM-NEXT:    mulhu a4, a2, a4
+; RV32IM-NEXT:    mulhu a4, a1, a4
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_one:
@@ -706,24 +706,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu s0, 24(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
 ; RV64I-NEXT:    lhu a2, 8(a1)
+; RV64I-NEXT:    lhu s0, 16(a1)
+; RV64I-NEXT:    lhu s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3@plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    sh a0, 6(s2)
-; RV64I-NEXT:    sh s1, 4(s2)
+; RV64I-NEXT:    sh s0, 4(s2)
 ; RV64I-NEXT:    sh s3, 2(s2)
 ; RV64I-NEXT:    sh zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -736,32 +736,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_one:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 8(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI4_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI4_0)(a3)
-; RV64IM-NEXT:    lhu a4, 24(a1)
-; RV64IM-NEXT:    lhu a1, 16(a1)
-; RV64IM-NEXT:    mulhu a3, a2, a3
+; RV64IM-NEXT:    lui a2, %hi(.LCPI4_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI4_0)(a2)
+; RV64IM-NEXT:    lhu a3, 8(a1)
+; RV64IM-NEXT:    lhu a4, 16(a1)
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    mulhu a2, a3, a2
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_1)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulhu a3, a1, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulhu a2, a4, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulhu a3, a4, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulhu a2, a1, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    subw a1, a1, a2
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a4, 6(a0)
-; RV64IM-NEXT:    sh a1, 4(a0)
-; RV64IM-NEXT:    sh a2, 2(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -791,28 +791,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 24(a1)
-; RV32I-NEXT:    lw s1, 28(a1)
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw s0, 8(a1)
+; RV32I-NEXT:    lw s1, 12(a1)
 ; RV32I-NEXT:    lw s2, 16(a1)
 ; RV32I-NEXT:    lw s3, 20(a1)
-; RV32I-NEXT:    lw s4, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    lw s4, 24(a1)
+; RV32I-NEXT:    lw s5, 28(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
-; RV32I-NEXT:    mv s4, a0
-; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    li a2, 23
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    mv a1, s3
@@ -822,16 +823,16 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s4
+; RV32I-NEXT:    mv a1, s5
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    sw a1, 28(s6)
 ; RV32I-NEXT:    sw a0, 24(s6)
 ; RV32I-NEXT:    sw s3, 20(s6)
 ; RV32I-NEXT:    sw s2, 16(s6)
-; RV32I-NEXT:    sw s5, 12(s6)
-; RV32I-NEXT:    sw s4, 8(s6)
+; RV32I-NEXT:    sw s1, 12(s6)
+; RV32I-NEXT:    sw s0, 8(s6)
 ; RV32I-NEXT:    sw s8, 4(s6)
 ; RV32I-NEXT:    sw s7, 0(s6)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -860,28 +861,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 24(a1)
-; RV32IM-NEXT:    lw s1, 28(a1)
+; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw s0, 8(a1)
+; RV32IM-NEXT:    lw s1, 12(a1)
 ; RV32IM-NEXT:    lw s2, 16(a1)
 ; RV32IM-NEXT:    lw s3, 20(a1)
-; RV32IM-NEXT:    lw s4, 8(a1)
-; RV32IM-NEXT:    lw s5, 12(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
+; RV32IM-NEXT:    lw s4, 24(a1)
+; RV32IM-NEXT:    lw s5, 28(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
 ; RV32IM-NEXT:    mv a0, a3
+; RV32IM-NEXT:    mv a1, a4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    mv a0, s0
+; RV32IM-NEXT:    mv a1, s1
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
-; RV32IM-NEXT:    mv s4, a0
-; RV32IM-NEXT:    mv s5, a1
+; RV32IM-NEXT:    mv s0, a0
+; RV32IM-NEXT:    mv s1, a1
 ; RV32IM-NEXT:    li a2, 23
 ; RV32IM-NEXT:    mv a0, s2
 ; RV32IM-NEXT:    mv a1, s3
@@ -891,16 +893,16 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s0
-; RV32IM-NEXT:    mv a1, s1
+; RV32IM-NEXT:    mv a0, s4
+; RV32IM-NEXT:    mv a1, s5
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
 ; RV32IM-NEXT:    sw a0, 24(s6)
 ; RV32IM-NEXT:    sw s3, 20(s6)
 ; RV32IM-NEXT:    sw s2, 16(s6)
-; RV32IM-NEXT:    sw s5, 12(s6)
-; RV32IM-NEXT:    sw s4, 8(s6)
+; RV32IM-NEXT:    sw s1, 12(s6)
+; RV32IM-NEXT:    sw s0, 8(s6)
 ; RV32IM-NEXT:    sw s8, 4(s6)
 ; RV32IM-NEXT:    sw s7, 0(s6)
 ; RV32IM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -924,24 +926,24 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    ld s0, 24(a1)
-; RV64I-NEXT:    ld s1, 16(a1)
 ; RV64I-NEXT:    ld a2, 8(a1)
+; RV64I-NEXT:    ld s0, 16(a1)
+; RV64I-NEXT:    ld s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3@plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3@plt
 ; RV64I-NEXT:    sd a0, 24(s2)
-; RV64I-NEXT:    sd s1, 16(s2)
+; RV64I-NEXT:    sd s0, 16(s2)
 ; RV64I-NEXT:    sd s3, 8(s2)
 ; RV64I-NEXT:    sd zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -954,39 +956,39 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_i64:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    ld a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI6_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI6_0)(a3)
-; RV64IM-NEXT:    ld a4, 24(a1)
-; RV64IM-NEXT:    ld a1, 8(a1)
-; RV64IM-NEXT:    mulhu a3, a2, a3
-; RV64IM-NEXT:    sub a5, a2, a3
+; RV64IM-NEXT:    lui a2, %hi(.LCPI6_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64IM-NEXT:    ld a3, 16(a1)
+; RV64IM-NEXT:    ld a4, 8(a1)
+; RV64IM-NEXT:    ld a1, 24(a1)
+; RV64IM-NEXT:    mulhu a2, a3, a2
+; RV64IM-NEXT:    sub a5, a3, a2
 ; RV64IM-NEXT:    srli a5, a5, 1
-; RV64IM-NEXT:    add a3, a5, a3
-; RV64IM-NEXT:    srli a3, a3, 4
+; RV64IM-NEXT:    add a2, a5, a2
+; RV64IM-NEXT:    srli a2, a2, 4
 ; RV64IM-NEXT:    li a5, 23
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI6_1)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI6_1)(a6)
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    sub a2, a2, a3
-; RV64IM-NEXT:    srli a3, a1, 1
-; RV64IM-NEXT:    mulhu a3, a3, a6
-; RV64IM-NEXT:    srli a3, a3, 7
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    sub a3, a3, a2
+; RV64IM-NEXT:    srli a2, a4, 1
+; RV64IM-NEXT:    mulhu a2, a2, a6
+; RV64IM-NEXT:    srli a2, a2, 7
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI6_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI6_2)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    sub a1, a1, a3
-; RV64IM-NEXT:    mulhu a3, a4, a5
-; RV64IM-NEXT:    srli a3, a3, 12
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    sub a4, a4, a2
+; RV64IM-NEXT:    mulhu a2, a1, a5
+; RV64IM-NEXT:    srli a2, a2, 12
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addiw a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    sub a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    sub a1, a1, a2
 ; RV64IM-NEXT:    sd zero, 0(a0)
-; RV64IM-NEXT:    sd a4, 24(a0)
-; RV64IM-NEXT:    sd a1, 8(a0)
-; RV64IM-NEXT:    sd a2, 16(a0)
+; RV64IM-NEXT:    sd a1, 24(a0)
+; RV64IM-NEXT:    sd a4, 8(a0)
+; RV64IM-NEXT:    sd a3, 16(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
   ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll
index 67d1bfac4d614b..a847bdacc9cb08 100644
--- a/llvm/test/CodeGen/RISCV/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/vararg.ll
@@ -498,11 +498,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 20
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 12(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 27
-; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -8
-; ILP32-ILP32F-FPELIM-NEXT:    addi a1, sp, 35
-; ILP32-ILP32F-FPELIM-NEXT:    sw a1, 12(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 4(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a0)
+; ILP32-ILP32F-FPELIM-NEXT:    andi a1, a0, -8
+; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 35
+; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 12(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a1)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 48
 ; ILP32-ILP32F-FPELIM-NEXT:    ret
 ;
@@ -522,11 +522,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 4
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 11
-; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -8
-; ILP32-ILP32F-WITHFP-NEXT:    addi a1, s0, 19
-; ILP32-ILP32F-WITHFP-NEXT:    sw a1, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 4(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a0)
+; ILP32-ILP32F-WITHFP-NEXT:    andi a1, a0, -8
+; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 19
+; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a1)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    addi sp, sp, 48
@@ -545,11 +545,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 20
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 12(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 27
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a0, a0, -8
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a1, sp, 35
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a1, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a1, 4(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a1, a0, -8
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 35
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a1)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a1, 4(a1)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92bd..c712421f16acb8 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -38,17 +38,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -72,8 +72,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -102,17 +102,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -136,8 +136,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -166,17 +166,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -198,51 +198,51 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    lbu a7, 3(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a3, a7, a6
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a3, a3, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    srl a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -272,17 +272,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    srl a1, a3, a5
@@ -334,51 +334,51 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    lbu a7, 3(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a3, a7, a6
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a3, a3, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    sll a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -408,17 +408,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    sll a1, a3, a5
@@ -470,51 +470,51 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    lbu a7, 3(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a3, a7, a6
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a3, a3, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    sra a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -544,18 +544,18 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a4, a6, 24
 ; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 1(a1)
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 3
+; RV32I-NEXT:    or a5, a1, a6
+; RV32I-NEXT:    slli a5, a5, 3
 ; RV32I-NEXT:    addi a6, a5, -32
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
@@ -607,51 +607,51 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
 ; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a4, a4, 35
-; RV64I-NEXT:    or a5, a4, a1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a5, a1, a4
 ; RV64I-NEXT:    addi a4, a5, -64
 ; RV64I-NEXT:    srl a1, a3, a5
 ; RV64I-NEXT:    bltz a4, .LBB6_2
@@ -659,25 +659,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
 ; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -779,38 +779,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
+; RV32I-NEXT:    lbu t6, 12(a0)
+; RV32I-NEXT:    lbu s0, 13(a0)
+; RV32I-NEXT:    lbu s1, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb t0, 6(a2)
+; RV32I-NEXT:    sb t1, 7(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -826,51 +826,51 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
 ; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a4, a4, 35
-; RV64I-NEXT:    or a5, a4, a1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a5, a1, a4
 ; RV64I-NEXT:    addi a4, a5, -64
 ; RV64I-NEXT:    sll a1, a3, a5
 ; RV64I-NEXT:    bltz a4, .LBB7_2
@@ -878,25 +878,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
 ; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 12(a0)
+; RV64I-NEXT:    lbu t3, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -998,38 +998,38 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 20
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
+; RV32I-NEXT:    lbu t6, 12(a0)
+; RV32I-NEXT:    lbu s0, 13(a0)
+; RV32I-NEXT:    lbu s1, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb t0, 6(a2)
+; RV32I-NEXT:    sb t1, 7(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1045,51 +1045,51 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu t0, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
 ; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    lbu t2, 4(a1)
+; RV64I-NEXT:    lbu t3, 5(a1)
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or t2, t3, t2
+; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a5, a5, 35
-; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, t1, t0
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a5, a1, a5
 ; RV64I-NEXT:    addi a6, a5, -64
 ; RV64I-NEXT:    sra a1, a3, a5
 ; RV64I-NEXT:    bltz a6, .LBB8_2
@@ -1099,25 +1099,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
 ; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    lbu t3, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t3
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
@@ -1167,94 +1167,94 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu a0, 13(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 5(a0)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu a0, 14(a0)
+; RV32I-NEXT:    slli s3, a3, 24
 ; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    sb a3, 15(sp)
-; RV32I-NEXT:    sb s3, 14(sp)
-; RV32I-NEXT:    sb a0, 13(sp)
-; RV32I-NEXT:    sb s2, 12(sp)
-; RV32I-NEXT:    sb s1, 11(sp)
-; RV32I-NEXT:    sb s0, 10(sp)
-; RV32I-NEXT:    sb t6, 9(sp)
-; RV32I-NEXT:    sb t5, 8(sp)
-; RV32I-NEXT:    sb t4, 7(sp)
-; RV32I-NEXT:    sb t3, 6(sp)
-; RV32I-NEXT:    sb t2, 5(sp)
-; RV32I-NEXT:    sb t1, 4(sp)
-; RV32I-NEXT:    sb t0, 3(sp)
-; RV32I-NEXT:    sb a7, 2(sp)
-; RV32I-NEXT:    sb a6, 1(sp)
-; RV32I-NEXT:    sb a5, 0(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a4, 16(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a0, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    sb a0, 14(sp)
+; RV32I-NEXT:    sb s2, 13(sp)
+; RV32I-NEXT:    sb s1, 12(sp)
+; RV32I-NEXT:    sb s0, 11(sp)
+; RV32I-NEXT:    sb t6, 10(sp)
+; RV32I-NEXT:    sb t5, 9(sp)
+; RV32I-NEXT:    sb t4, 8(sp)
+; RV32I-NEXT:    sb t3, 7(sp)
+; RV32I-NEXT:    sb t2, 6(sp)
+; RV32I-NEXT:    sb t1, 5(sp)
+; RV32I-NEXT:    sb t0, 4(sp)
+; RV32I-NEXT:    sb a7, 3(sp)
+; RV32I-NEXT:    sb a6, 2(sp)
+; RV32I-NEXT:    sb a5, 1(sp)
+; RV32I-NEXT:    sb a4, 0(sp)
+; RV32I-NEXT:    srai a0, s3, 31
+; RV32I-NEXT:    sb a0, 28(sp)
+; RV32I-NEXT:    sb a0, 24(sp)
+; RV32I-NEXT:    sb a0, 20(sp)
+; RV32I-NEXT:    sb a0, 16(sp)
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a0, 25(sp)
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a4, 22(sp)
+; RV32I-NEXT:    sb a0, 21(sp)
+; RV32I-NEXT:    sb a3, 19(sp)
+; RV32I-NEXT:    sb a4, 18(sp)
+; RV32I-NEXT:    sb a0, 17(sp)
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
+; RV32I-NEXT:    lbu t6, 12(a0)
+; RV32I-NEXT:    lbu s0, 13(a0)
+; RV32I-NEXT:    lbu s1, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb t0, 6(a2)
+; RV32I-NEXT:    sb t1, 7(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1286,18 +1286,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv a5, a1
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu a6, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1318,19 +1321,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    lbu ra, 24(a0)
 ; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
 ; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 87(sp)
-; RV64I-NEXT:    sb a4, 86(sp)
+; RV64I-NEXT:    lbu a5, 0(a5)
+; RV64I-NEXT:    sb a6, 87(sp)
+; RV64I-NEXT:    sb a7, 86(sp)
 ; RV64I-NEXT:    sb a0, 85(sp)
-; RV64I-NEXT:    sb a5, 84(sp)
-; RV64I-NEXT:    sb a6, 83(sp)
-; RV64I-NEXT:    sb a7, 82(sp)
+; RV64I-NEXT:    sb a1, 84(sp)
+; RV64I-NEXT:    sb a3, 83(sp)
+; RV64I-NEXT:    sb a4, 82(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1395,80 +1396,80 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 57(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 56
-; RV64I-NEXT:    add a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
+; RV64I-NEXT:    add t1, a0, a5
+; RV64I-NEXT:    lbu a0, 0(t1)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
+; RV64I-NEXT:    lbu a0, 1(t1)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
+; RV64I-NEXT:    lbu a6, 2(t1)
+; RV64I-NEXT:    lbu t0, 3(t1)
+; RV64I-NEXT:    lbu t2, 4(t1)
+; RV64I-NEXT:    lbu t3, 5(t1)
+; RV64I-NEXT:    lbu t4, 6(t1)
+; RV64I-NEXT:    lbu t5, 7(t1)
+; RV64I-NEXT:    lbu a0, 8(t1)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(t1)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(t1)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    lbu a7, 11(t1)
+; RV64I-NEXT:    lbu t6, 12(t1)
+; RV64I-NEXT:    lbu s0, 13(t1)
+; RV64I-NEXT:    lbu s1, 14(t1)
+; RV64I-NEXT:    lbu s2, 15(t1)
+; RV64I-NEXT:    lbu s3, 16(t1)
+; RV64I-NEXT:    lbu s4, 17(t1)
+; RV64I-NEXT:    lbu s5, 18(t1)
+; RV64I-NEXT:    lbu s6, 19(t1)
+; RV64I-NEXT:    lbu s7, 20(t1)
+; RV64I-NEXT:    lbu s8, 21(t1)
+; RV64I-NEXT:    lbu s9, 22(t1)
+; RV64I-NEXT:    lbu s10, 23(t1)
+; RV64I-NEXT:    lbu s11, 24(t1)
+; RV64I-NEXT:    lbu ra, 25(t1)
+; RV64I-NEXT:    lbu a5, 26(t1)
+; RV64I-NEXT:    lbu a4, 27(t1)
+; RV64I-NEXT:    lbu a3, 28(t1)
+; RV64I-NEXT:    lbu a1, 29(t1)
+; RV64I-NEXT:    lbu a0, 30(t1)
+; RV64I-NEXT:    lbu t1, 31(t1)
+; RV64I-NEXT:    sb s10, 23(a2)
+; RV64I-NEXT:    sb s9, 22(a2)
+; RV64I-NEXT:    sb s8, 21(a2)
+; RV64I-NEXT:    sb s7, 20(a2)
+; RV64I-NEXT:    sb s6, 19(a2)
+; RV64I-NEXT:    sb s5, 18(a2)
+; RV64I-NEXT:    sb s4, 17(a2)
+; RV64I-NEXT:    sb s3, 16(a2)
+; RV64I-NEXT:    sb t1, 31(a2)
+; RV64I-NEXT:    sb a0, 30(a2)
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    sb ra, 25(a2)
+; RV64I-NEXT:    sb s11, 24(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 4(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 1(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    sb s2, 15(a2)
+; RV64I-NEXT:    sb s1, 14(a2)
+; RV64I-NEXT:    sb s0, 13(a2)
+; RV64I-NEXT:    sb t6, 12(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 8(a2)
@@ -1504,18 +1505,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a5, a1
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1536,19 +1540,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    lbu ra, 24(a0)
 ; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
 ; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 59(sp)
-; RV32I-NEXT:    sb a4, 58(sp)
+; RV32I-NEXT:    lbu a5, 0(a5)
+; RV32I-NEXT:    sb a6, 59(sp)
+; RV32I-NEXT:    sb a7, 58(sp)
 ; RV32I-NEXT:    sb a0, 57(sp)
-; RV32I-NEXT:    sb a5, 56(sp)
-; RV32I-NEXT:    sb a6, 55(sp)
-; RV32I-NEXT:    sb a7, 54(sp)
+; RV32I-NEXT:    sb a1, 56(sp)
+; RV32I-NEXT:    sb a3, 55(sp)
+; RV32I-NEXT:    sb a4, 54(sp)
 ; RV32I-NEXT:    sb zero, 91(sp)
 ; RV32I-NEXT:    sb zero, 90(sp)
 ; RV32I-NEXT:    sb zero, 89(sp)
@@ -1613,82 +1615,82 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 29(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    andi a1, a1, 31
+; RV32I-NEXT:    andi a5, a5, 31
 ; RV32I-NEXT:    addi a0, sp, 28
-; RV32I-NEXT:    add a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
+; RV32I-NEXT:    add t1, a0, a5
+; RV32I-NEXT:    lbu a0, 0(t1)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
+; RV32I-NEXT:    lbu a0, 1(t1)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
+; RV32I-NEXT:    lbu a7, 2(t1)
+; RV32I-NEXT:    lbu t0, 3(t1)
+; RV32I-NEXT:    lbu a0, 4(t1)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 5(t1)
+; RV32I-NEXT:    lbu a0, 6(t1)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(t1)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    lbu t2, 8(t1)
+; RV32I-NEXT:    lbu t3, 9(t1)
+; RV32I-NEXT:    lbu t4, 10(t1)
+; RV32I-NEXT:    lbu t5, 11(t1)
+; RV32I-NEXT:    lbu t6, 12(t1)
+; RV32I-NEXT:    lbu s0, 13(t1)
+; RV32I-NEXT:    lbu s1, 14(t1)
+; RV32I-NEXT:    lbu s2, 15(t1)
+; RV32I-NEXT:    lbu s3, 16(t1)
+; RV32I-NEXT:    lbu s4, 17(t1)
+; RV32I-NEXT:    lbu s5, 18(t1)
+; RV32I-NEXT:    lbu s6, 19(t1)
+; RV32I-NEXT:    lbu s7, 20(t1)
+; RV32I-NEXT:    lbu s8, 21(t1)
+; RV32I-NEXT:    lbu s9, 22(t1)
+; RV32I-NEXT:    lbu s10, 23(t1)
+; RV32I-NEXT:    lbu s11, 24(t1)
+; RV32I-NEXT:    lbu ra, 25(t1)
+; RV32I-NEXT:    lbu a4, 26(t1)
+; RV32I-NEXT:    lbu a0, 27(t1)
+; RV32I-NEXT:    lbu a3, 28(t1)
+; RV32I-NEXT:    lbu a1, 29(t1)
+; RV32I-NEXT:    lbu a5, 30(t1)
+; RV32I-NEXT:    lbu t1, 31(t1)
+; RV32I-NEXT:    sb ra, 25(a2)
+; RV32I-NEXT:    sb s11, 24(a2)
+; RV32I-NEXT:    sb a0, 27(a2)
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    sb a1, 29(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    sb t1, 31(a2)
+; RV32I-NEXT:    sb a5, 30(a2)
+; RV32I-NEXT:    sb s4, 17(a2)
+; RV32I-NEXT:    sb s3, 16(a2)
+; RV32I-NEXT:    sb s6, 19(a2)
+; RV32I-NEXT:    sb s5, 18(a2)
+; RV32I-NEXT:    sb s8, 21(a2)
+; RV32I-NEXT:    sb s7, 20(a2)
+; RV32I-NEXT:    sb s10, 23(a2)
+; RV32I-NEXT:    sb s9, 22(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s2, 15(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb a6, 5(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
@@ -1729,18 +1731,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv a5, a1
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu a6, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1761,19 +1766,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    lbu ra, 24(a0)
 ; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
 ; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 119(sp)
-; RV64I-NEXT:    sb a4, 118(sp)
+; RV64I-NEXT:    lbu a5, 0(a5)
+; RV64I-NEXT:    sb a6, 119(sp)
+; RV64I-NEXT:    sb a7, 118(sp)
 ; RV64I-NEXT:    sb a0, 117(sp)
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    sb a1, 116(sp)
+; RV64I-NEXT:    sb a3, 115(sp)
+; RV64I-NEXT:    sb a4, 114(sp)
 ; RV64I-NEXT:    sb zero, 87(sp)
 ; RV64I-NEXT:    sb zero, 86(sp)
 ; RV64I-NEXT:    sb zero, 85(sp)
@@ -1838,80 +1841,80 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 89(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 88
-; RV64I-NEXT:    sub a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
+; RV64I-NEXT:    sub t1, a0, a5
+; RV64I-NEXT:    lbu a0, 0(t1)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
+; RV64I-NEXT:    lbu a0, 1(t1)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
+; RV64I-NEXT:    lbu a6, 2(t1)
+; RV64I-NEXT:    lbu t0, 3(t1)
+; RV64I-NEXT:    lbu t2, 4(t1)
+; RV64I-NEXT:    lbu t3, 5(t1)
+; RV64I-NEXT:    lbu t4, 6(t1)
+; RV64I-NEXT:    lbu t5, 7(t1)
+; RV64I-NEXT:    lbu a0, 8(t1)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(t1)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(t1)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    lbu a7, 11(t1)
+; RV64I-NEXT:    lbu t6, 12(t1)
+; RV64I-NEXT:    lbu s0, 13(t1)
+; RV64I-NEXT:    lbu s1, 14(t1)
+; RV64I-NEXT:    lbu s2, 15(t1)
+; RV64I-NEXT:    lbu s3, 16(t1)
+; RV64I-NEXT:    lbu s4, 17(t1)
+; RV64I-NEXT:    lbu s5, 18(t1)
+; RV64I-NEXT:    lbu s6, 19(t1)
+; RV64I-NEXT:    lbu s7, 20(t1)
+; RV64I-NEXT:    lbu s8, 21(t1)
+; RV64I-NEXT:    lbu s9, 22(t1)
+; RV64I-NEXT:    lbu s10, 23(t1)
+; RV64I-NEXT:    lbu s11, 24(t1)
+; RV64I-NEXT:    lbu ra, 25(t1)
+; RV64I-NEXT:    lbu a5, 26(t1)
+; RV64I-NEXT:    lbu a4, 27(t1)
+; RV64I-NEXT:    lbu a3, 28(t1)
+; RV64I-NEXT:    lbu a1, 29(t1)
+; RV64I-NEXT:    lbu a0, 30(t1)
+; RV64I-NEXT:    lbu t1, 31(t1)
+; RV64I-NEXT:    sb s10, 23(a2)
+; RV64I-NEXT:    sb s9, 22(a2)
+; RV64I-NEXT:    sb s8, 21(a2)
+; RV64I-NEXT:    sb s7, 20(a2)
+; RV64I-NEXT:    sb s6, 19(a2)
+; RV64I-NEXT:    sb s5, 18(a2)
+; RV64I-NEXT:    sb s4, 17(a2)
+; RV64I-NEXT:    sb s3, 16(a2)
+; RV64I-NEXT:    sb t1, 31(a2)
+; RV64I-NEXT:    sb a0, 30(a2)
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    sb ra, 25(a2)
+; RV64I-NEXT:    sb s11, 24(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 4(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 1(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    sb s2, 15(a2)
+; RV64I-NEXT:    sb s1, 14(a2)
+; RV64I-NEXT:    sb s0, 13(a2)
+; RV64I-NEXT:    sb t6, 12(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 8(a2)
@@ -1947,18 +1950,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a5, a1
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1979,19 +1985,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    lbu ra, 24(a0)
 ; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
 ; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 91(sp)
-; RV32I-NEXT:    sb a4, 90(sp)
+; RV32I-NEXT:    lbu a5, 0(a5)
+; RV32I-NEXT:    sb a6, 91(sp)
+; RV32I-NEXT:    sb a7, 90(sp)
 ; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a5, 88(sp)
-; RV32I-NEXT:    sb a6, 87(sp)
-; RV32I-NEXT:    sb a7, 86(sp)
+; RV32I-NEXT:    sb a1, 88(sp)
+; RV32I-NEXT:    sb a3, 87(sp)
+; RV32I-NEXT:    sb a4, 86(sp)
 ; RV32I-NEXT:    sb zero, 59(sp)
 ; RV32I-NEXT:    sb zero, 58(sp)
 ; RV32I-NEXT:    sb zero, 57(sp)
@@ -2056,82 +2060,82 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 61(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    andi a1, a1, 31
+; RV32I-NEXT:    andi a5, a5, 31
 ; RV32I-NEXT:    addi a0, sp, 60
-; RV32I-NEXT:    sub a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
+; RV32I-NEXT:    sub t1, a0, a5
+; RV32I-NEXT:    lbu a0, 0(t1)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
+; RV32I-NEXT:    lbu a0, 1(t1)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
+; RV32I-NEXT:    lbu a7, 2(t1)
+; RV32I-NEXT:    lbu t0, 3(t1)
+; RV32I-NEXT:    lbu a0, 4(t1)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 5(t1)
+; RV32I-NEXT:    lbu a0, 6(t1)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(t1)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    lbu t2, 8(t1)
+; RV32I-NEXT:    lbu t3, 9(t1)
+; RV32I-NEXT:    lbu t4, 10(t1)
+; RV32I-NEXT:    lbu t5, 11(t1)
+; RV32I-NEXT:    lbu t6, 12(t1)
+; RV32I-NEXT:    lbu s0, 13(t1)
+; RV32I-NEXT:    lbu s1, 14(t1)
+; RV32I-NEXT:    lbu s2, 15(t1)
+; RV32I-NEXT:    lbu s3, 16(t1)
+; RV32I-NEXT:    lbu s4, 17(t1)
+; RV32I-NEXT:    lbu s5, 18(t1)
+; RV32I-NEXT:    lbu s6, 19(t1)
+; RV32I-NEXT:    lbu s7, 20(t1)
+; RV32I-NEXT:    lbu s8, 21(t1)
+; RV32I-NEXT:    lbu s9, 22(t1)
+; RV32I-NEXT:    lbu s10, 23(t1)
+; RV32I-NEXT:    lbu s11, 24(t1)
+; RV32I-NEXT:    lbu ra, 25(t1)
+; RV32I-NEXT:    lbu a4, 26(t1)
+; RV32I-NEXT:    lbu a0, 27(t1)
+; RV32I-NEXT:    lbu a3, 28(t1)
+; RV32I-NEXT:    lbu a1, 29(t1)
+; RV32I-NEXT:    lbu a5, 30(t1)
+; RV32I-NEXT:    lbu t1, 31(t1)
+; RV32I-NEXT:    sb ra, 25(a2)
+; RV32I-NEXT:    sb s11, 24(a2)
+; RV32I-NEXT:    sb a0, 27(a2)
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    sb a1, 29(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    sb t1, 31(a2)
+; RV32I-NEXT:    sb a5, 30(a2)
+; RV32I-NEXT:    sb s4, 17(a2)
+; RV32I-NEXT:    sb s3, 16(a2)
+; RV32I-NEXT:    sb s6, 19(a2)
+; RV32I-NEXT:    sb s5, 18(a2)
+; RV32I-NEXT:    sb s8, 21(a2)
+; RV32I-NEXT:    sb s7, 20(a2)
+; RV32I-NEXT:    sb s10, 23(a2)
+; RV32I-NEXT:    sb s9, 22(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s2, 15(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb a6, 5(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
@@ -2172,20 +2176,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv t0, a1
-; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    mv t1, a1
+; RV64I-NEXT:    lbu t0, 30(a0)
 ; RV64I-NEXT:    lbu a1, 0(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    lbu a1, 2(a0)
 ; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    lbu a1, 3(a0)
 ; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    lbu a1, 4(a0)
 ; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t2, 6(a0)
 ; RV64I-NEXT:    lbu t3, 7(a0)
 ; RV64I-NEXT:    lbu t4, 8(a0)
@@ -2208,18 +2212,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a6, 25(a0)
 ; RV64I-NEXT:    lbu a5, 26(a0)
 ; RV64I-NEXT:    lbu a4, 27(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    lbu t0, 0(t0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
+; RV64I-NEXT:    lbu a3, 28(a0)
+; RV64I-NEXT:    lbu a1, 29(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    lbu t1, 0(t1)
+; RV64I-NEXT:    sd t1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sb t0, 86(sp)
+; RV64I-NEXT:    sb a1, 85(sp)
+; RV64I-NEXT:    sb a3, 84(sp)
 ; RV64I-NEXT:    sb a4, 83(sp)
 ; RV64I-NEXT:    sb a5, 82(sp)
 ; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
+; RV64I-NEXT:    sb a0, 87(sp)
+; RV64I-NEXT:    slli a0, a0, 56
 ; RV64I-NEXT:    sb a7, 80(sp)
 ; RV64I-NEXT:    sb ra, 79(sp)
 ; RV64I-NEXT:    sb s11, 78(sp)
@@ -2239,19 +2244,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb t4, 64(sp)
 ; RV64I-NEXT:    sb t3, 63(sp)
 ; RV64I-NEXT:    sb t2, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
+; RV64I-NEXT:    ld a1, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 61(sp)
+; RV64I-NEXT:    ld a1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 60(sp)
+; RV64I-NEXT:    ld a1, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 59(sp)
+; RV64I-NEXT:    ld a1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 58(sp)
+; RV64I-NEXT:    ld a1, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 57(sp)
+; RV64I-NEXT:    ld a1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 56(sp)
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2291,80 +2296,81 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a6, 91(sp)
 ; RV64I-NEXT:    sb a7, 90(sp)
 ; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    andi a0, t0, 31
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    andi a0, a0, 31
 ; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a6, a1, a0
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
+; RV64I-NEXT:    add t1, a1, a0
+; RV64I-NEXT:    lbu a0, 0(t1)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
+; RV64I-NEXT:    lbu a0, 1(t1)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
+; RV64I-NEXT:    lbu a6, 2(t1)
+; RV64I-NEXT:    lbu t0, 3(t1)
+; RV64I-NEXT:    lbu t2, 4(t1)
+; RV64I-NEXT:    lbu t3, 5(t1)
+; RV64I-NEXT:    lbu t4, 6(t1)
+; RV64I-NEXT:    lbu t5, 7(t1)
+; RV64I-NEXT:    lbu a0, 8(t1)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(t1)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(t1)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    lbu a7, 11(t1)
+; RV64I-NEXT:    lbu t6, 12(t1)
+; RV64I-NEXT:    lbu s0, 13(t1)
+; RV64I-NEXT:    lbu s1, 14(t1)
+; RV64I-NEXT:    lbu s2, 15(t1)
+; RV64I-NEXT:    lbu s3, 16(t1)
+; RV64I-NEXT:    lbu s4, 17(t1)
+; RV64I-NEXT:    lbu s5, 18(t1)
+; RV64I-NEXT:    lbu s6, 19(t1)
+; RV64I-NEXT:    lbu s7, 20(t1)
+; RV64I-NEXT:    lbu s8, 21(t1)
+; RV64I-NEXT:    lbu s9, 22(t1)
+; RV64I-NEXT:    lbu s10, 23(t1)
+; RV64I-NEXT:    lbu s11, 24(t1)
+; RV64I-NEXT:    lbu ra, 25(t1)
+; RV64I-NEXT:    lbu a5, 26(t1)
+; RV64I-NEXT:    lbu a4, 27(t1)
+; RV64I-NEXT:    lbu a3, 28(t1)
+; RV64I-NEXT:    lbu a1, 29(t1)
+; RV64I-NEXT:    lbu a0, 30(t1)
+; RV64I-NEXT:    lbu t1, 31(t1)
+; RV64I-NEXT:    sb s10, 23(a2)
+; RV64I-NEXT:    sb s9, 22(a2)
+; RV64I-NEXT:    sb s8, 21(a2)
+; RV64I-NEXT:    sb s7, 20(a2)
+; RV64I-NEXT:    sb s6, 19(a2)
+; RV64I-NEXT:    sb s5, 18(a2)
+; RV64I-NEXT:    sb s4, 17(a2)
+; RV64I-NEXT:    sb s3, 16(a2)
+; RV64I-NEXT:    sb t1, 31(a2)
+; RV64I-NEXT:    sb a0, 30(a2)
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    sb ra, 25(a2)
+; RV64I-NEXT:    sb s11, 24(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 4(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 1(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    sb s2, 15(a2)
+; RV64I-NEXT:    sb s1, 14(a2)
+; RV64I-NEXT:    sb s0, 13(a2)
+; RV64I-NEXT:    sb t6, 12(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 8(a2)
@@ -2400,20 +2406,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    lbu t1, 31(a0)
+; RV32I-NEXT:    mv t1, a1
+; RV32I-NEXT:    lbu t0, 30(a0)
 ; RV32I-NEXT:    lbu a1, 0(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
 ; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a1, 3(a0)
 ; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    lbu a1, 4(a0)
 ; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t2, 6(a0)
 ; RV32I-NEXT:    lbu t3, 7(a0)
 ; RV32I-NEXT:    lbu t4, 8(a0)
@@ -2436,18 +2442,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a6, 25(a0)
 ; RV32I-NEXT:    lbu a5, 26(a0)
 ; RV32I-NEXT:    lbu a4, 27(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    lbu t0, 0(t0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
+; RV32I-NEXT:    lbu a3, 28(a0)
+; RV32I-NEXT:    lbu a1, 29(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu t1, 0(t1)
+; RV32I-NEXT:    sw t1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sb t0, 58(sp)
+; RV32I-NEXT:    sb a1, 57(sp)
+; RV32I-NEXT:    sb a3, 56(sp)
 ; RV32I-NEXT:    sb a4, 55(sp)
 ; RV32I-NEXT:    sb a5, 54(sp)
 ; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb t1, 59(sp)
-; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    sb a0, 59(sp)
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    sb a7, 52(sp)
 ; RV32I-NEXT:    sb ra, 51(sp)
 ; RV32I-NEXT:    sb s11, 50(sp)
@@ -2467,19 +2474,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t4, 36(sp)
 ; RV32I-NEXT:    sb t3, 35(sp)
 ; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t1, 31
+; RV32I-NEXT:    lw a1, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 33(sp)
+; RV32I-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 31(sp)
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 30(sp)
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -2515,82 +2522,83 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a1, 63(sp)
 ; RV32I-NEXT:    sb a3, 62(sp)
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    andi a0, t0, 31
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi a0, a0, 31
 ; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a6, a1, a0
-; RV32I-NEXT:    lbu a0, 6(a6)
+; RV32I-NEXT:    add t1, a1, a0
+; RV32I-NEXT:    lbu a0, 0(t1)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
+; RV32I-NEXT:    lbu a0, 1(t1)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
+; RV32I-NEXT:    lbu a7, 2(t1)
+; RV32I-NEXT:    lbu t0, 3(t1)
+; RV32I-NEXT:    lbu a0, 4(t1)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 5(t1)
+; RV32I-NEXT:    lbu a0, 6(t1)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(t1)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    lbu t2, 8(t1)
+; RV32I-NEXT:    lbu t3, 9(t1)
+; RV32I-NEXT:    lbu t4, 10(t1)
+; RV32I-NEXT:    lbu t5, 11(t1)
+; RV32I-NEXT:    lbu t6, 12(t1)
+; RV32I-NEXT:    lbu s0, 13(t1)
+; RV32I-NEXT:    lbu s1, 14(t1)
+; RV32I-NEXT:    lbu s2, 15(t1)
+; RV32I-NEXT:    lbu s3, 16(t1)
+; RV32I-NEXT:    lbu s4, 17(t1)
+; RV32I-NEXT:    lbu s5, 18(t1)
+; RV32I-NEXT:    lbu s6, 19(t1)
+; RV32I-NEXT:    lbu s7, 20(t1)
+; RV32I-NEXT:    lbu s8, 21(t1)
+; RV32I-NEXT:    lbu s9, 22(t1)
+; RV32I-NEXT:    lbu s10, 23(t1)
+; RV32I-NEXT:    lbu s11, 24(t1)
+; RV32I-NEXT:    lbu ra, 25(t1)
+; RV32I-NEXT:    lbu a4, 26(t1)
+; RV32I-NEXT:    lbu a0, 27(t1)
+; RV32I-NEXT:    lbu a3, 28(t1)
+; RV32I-NEXT:    lbu a1, 29(t1)
+; RV32I-NEXT:    lbu a5, 30(t1)
+; RV32I-NEXT:    lbu t1, 31(t1)
+; RV32I-NEXT:    sb ra, 25(a2)
+; RV32I-NEXT:    sb s11, 24(a2)
+; RV32I-NEXT:    sb a0, 27(a2)
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    sb a1, 29(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    sb t1, 31(a2)
+; RV32I-NEXT:    sb a5, 30(a2)
+; RV32I-NEXT:    sb s4, 17(a2)
+; RV32I-NEXT:    sb s3, 16(a2)
+; RV32I-NEXT:    sb s6, 19(a2)
+; RV32I-NEXT:    sb s5, 18(a2)
+; RV32I-NEXT:    sb s8, 21(a2)
+; RV32I-NEXT:    sb s7, 20(a2)
+; RV32I-NEXT:    sb s10, 23(a2)
+; RV32I-NEXT:    sb s9, 22(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s2, 15(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb a6, 5(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afaa..3db42f1ba1a012 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -37,17 +37,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -69,8 +69,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -98,17 +98,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -130,8 +130,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -159,17 +159,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -189,47 +189,47 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -262,17 +262,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a5, a1, a5
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    srl a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB3_2
@@ -322,47 +322,47 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -395,17 +395,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a5, a1, a5
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    sll a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB4_2
@@ -455,47 +455,47 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -528,17 +528,17 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a4, a6, 24
 ; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 1(a1)
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a5, a1, a6
 ; RV32I-NEXT:    addi a6, a5, -32
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
@@ -589,47 +589,47 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
 ; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a4
@@ -640,25 +640,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
 ; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -710,36 +710,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu s2, 12(a0)
 ; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
+; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 39(sp)
 ; RV32I-NEXT:    sb zero, 38(sp)
 ; RV32I-NEXT:    sb zero, 37(sp)
@@ -752,115 +749,120 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
 ; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb s4, 26(sp)
-; RV32I-NEXT:    sb s3, 25(sp)
-; RV32I-NEXT:    sb s2, 24(sp)
-; RV32I-NEXT:    sb t6, 23(sp)
-; RV32I-NEXT:    sb t5, 22(sp)
-; RV32I-NEXT:    sb t4, 21(sp)
-; RV32I-NEXT:    sb t3, 20(sp)
-; RV32I-NEXT:    sb t2, 19(sp)
-; RV32I-NEXT:    sb t1, 18(sp)
-; RV32I-NEXT:    sb t0, 17(sp)
-; RV32I-NEXT:    sb a7, 16(sp)
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    sb a5, 14(sp)
-; RV32I-NEXT:    sb a4, 13(sp)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 12
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    sb zero, 27(sp)
+; RV32I-NEXT:    sb zero, 26(sp)
+; RV32I-NEXT:    sb zero, 25(sp)
+; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    sb s5, 23(sp)
+; RV32I-NEXT:    sb s4, 22(sp)
+; RV32I-NEXT:    sb s3, 21(sp)
+; RV32I-NEXT:    sb s2, 20(sp)
+; RV32I-NEXT:    sb s1, 19(sp)
+; RV32I-NEXT:    sb s0, 18(sp)
+; RV32I-NEXT:    sb t6, 17(sp)
+; RV32I-NEXT:    sb t5, 16(sp)
+; RV32I-NEXT:    sb t4, 15(sp)
+; RV32I-NEXT:    sb t3, 14(sp)
+; RV32I-NEXT:    sb t2, 13(sp)
+; RV32I-NEXT:    sb t1, 12(sp)
+; RV32I-NEXT:    sb t0, 11(sp)
+; RV32I-NEXT:    sb a7, 10(sp)
+; RV32I-NEXT:    sb a6, 9(sp)
+; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    addi a3, sp, 8
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a6, 3(a1)
+; RV32I-NEXT:    lbu a7, 4(a1)
+; RV32I-NEXT:    lbu t0, 5(a1)
+; RV32I-NEXT:    lbu t1, 6(a1)
+; RV32I-NEXT:    lbu t2, 7(a1)
+; RV32I-NEXT:    lbu t3, 8(a1)
+; RV32I-NEXT:    lbu t4, 9(a1)
+; RV32I-NEXT:    lbu t5, 10(a1)
+; RV32I-NEXT:    lbu t6, 11(a1)
+; RV32I-NEXT:    lbu s0, 12(a1)
+; RV32I-NEXT:    lbu s1, 13(a1)
+; RV32I-NEXT:    lbu s2, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a4
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl t0, a7, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    not t3, a0
+; RV32I-NEXT:    sll t2, t2, t3
+; RV32I-NEXT:    or t2, t0, t2
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    xori a4, a0, 31
+; RV32I-NEXT:    sll a5, a7, a4
+; RV32I-NEXT:    or a5, a3, a5
+; RV32I-NEXT:    srl a6, t1, a0
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    sll a4, a7, a4
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    srl a0, a1, a0
 ; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb t0, 4(a2)
+; RV32I-NEXT:    srli a1, a6, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a6, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
 ; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
-; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 11(a2)
 ; RV32I-NEXT:    srli a5, a5, 24
 ; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a0, t2, 24
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -872,47 +874,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
 ; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a4
@@ -923,25 +925,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
 ; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 12(a0)
+; RV64I-NEXT:    lbu t3, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -993,36 +995,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu s2, 12(a0)
 ; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 23(sp)
 ; RV32I-NEXT:    sb zero, 22(sp)
 ; RV32I-NEXT:    sb zero, 21(sp)
@@ -1035,115 +1034,120 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 14(sp)
 ; RV32I-NEXT:    sb zero, 13(sp)
 ; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb a0, 43(sp)
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    sb t0, 33(sp)
-; RV32I-NEXT:    sb a7, 32(sp)
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    sb a5, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a3, 28(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 28
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    sb zero, 11(sp)
+; RV32I-NEXT:    sb zero, 10(sp)
+; RV32I-NEXT:    sb zero, 9(sp)
+; RV32I-NEXT:    sb zero, 8(sp)
+; RV32I-NEXT:    sb s5, 39(sp)
+; RV32I-NEXT:    sb s4, 38(sp)
+; RV32I-NEXT:    sb s3, 37(sp)
+; RV32I-NEXT:    sb s2, 36(sp)
+; RV32I-NEXT:    sb s1, 35(sp)
+; RV32I-NEXT:    sb s0, 34(sp)
+; RV32I-NEXT:    sb t6, 33(sp)
+; RV32I-NEXT:    sb t5, 32(sp)
+; RV32I-NEXT:    sb t4, 31(sp)
+; RV32I-NEXT:    sb t3, 30(sp)
+; RV32I-NEXT:    sb t2, 29(sp)
+; RV32I-NEXT:    sb t1, 28(sp)
+; RV32I-NEXT:    sb t0, 27(sp)
+; RV32I-NEXT:    sb a7, 26(sp)
+; RV32I-NEXT:    sb a6, 25(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    addi a3, sp, 24
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lbu a1, 0(a3)
+; RV32I-NEXT:    lbu a4, 1(a3)
+; RV32I-NEXT:    lbu a5, 2(a3)
+; RV32I-NEXT:    lbu a6, 3(a3)
+; RV32I-NEXT:    lbu a7, 4(a3)
+; RV32I-NEXT:    lbu t0, 5(a3)
+; RV32I-NEXT:    lbu t1, 6(a3)
+; RV32I-NEXT:    lbu t2, 7(a3)
+; RV32I-NEXT:    lbu t3, 8(a3)
+; RV32I-NEXT:    lbu t4, 9(a3)
+; RV32I-NEXT:    lbu t5, 10(a3)
+; RV32I-NEXT:    lbu t6, 11(a3)
+; RV32I-NEXT:    lbu s0, 12(a3)
+; RV32I-NEXT:    lbu s1, 13(a3)
+; RV32I-NEXT:    lbu s2, 14(a3)
+; RV32I-NEXT:    lbu a3, 15(a3)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    sll t0, a7, a0
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    sll a0, a5, a4
-; RV32I-NEXT:    lbu a1, 1(a3)
-; RV32I-NEXT:    lbu a6, 0(a3)
-; RV32I-NEXT:    lbu a7, 2(a3)
-; RV32I-NEXT:    lbu t0, 3(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    srli a1, a6, 1
-; RV32I-NEXT:    xori a7, a4, 31
-; RV32I-NEXT:    srl a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu t0, 13(a3)
-; RV32I-NEXT:    lbu t1, 12(a3)
-; RV32I-NEXT:    lbu t2, 14(a3)
-; RV32I-NEXT:    lbu t3, 15(a3)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t3, t2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sll t0, t0, a4
-; RV32I-NEXT:    lbu t1, 9(a3)
-; RV32I-NEXT:    lbu t2, 8(a3)
-; RV32I-NEXT:    lbu t3, 10(a3)
-; RV32I-NEXT:    lbu a3, 11(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    srli a4, a1, 1
+; RV32I-NEXT:    xori a5, a0, 31
+; RV32I-NEXT:    srl a4, a4, a5
+; RV32I-NEXT:    or a4, t0, a4
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    srli t1, a3, 1
-; RV32I-NEXT:    srl a7, t1, a7
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    sll a3, a3, a4
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    not t1, a4
-; RV32I-NEXT:    srl a5, a5, t1
+; RV32I-NEXT:    or a3, a3, s2
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    sll a3, a3, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a6, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    srli t1, a6, 1
+; RV32I-NEXT:    srl a5, t1, a5
 ; RV32I-NEXT:    or a5, a3, a5
-; RV32I-NEXT:    sll a4, a6, a4
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    srli a6, a3, 16
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    srli a6, a3, 24
-; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    sll a6, a6, a0
+; RV32I-NEXT:    srli a7, a7, 1
+; RV32I-NEXT:    not t1, a0
+; RV32I-NEXT:    srl a7, a7, t1
+; RV32I-NEXT:    or a7, a6, a7
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    srli a1, a6, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a1, a6, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    srli a3, t0, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a3, t0, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, t0, 8
 ; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, t0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    sb a5, 8(a2)
-; RV32I-NEXT:    sb a7, 12(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1155,47 +1159,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
 ; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
 ; RV64I-NEXT:    lbu a7, 2(a1)
 ; RV64I-NEXT:    lbu t0, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    lbu t0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a5
@@ -1208,25 +1212,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
 ; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    lbu t3, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t3
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
@@ -1277,8 +1281,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a0)
 ; RV32I-NEXT:    lbu a6, 1(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
@@ -1290,25 +1294,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t5, 8(a0)
 ; RV32I-NEXT:    lbu t6, 9(a0)
 ; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 0(a1)
-; RV32I-NEXT:    lbu s3, 11(a0)
-; RV32I-NEXT:    lbu s4, 12(a0)
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    lbu s2, 2(a1)
+; RV32I-NEXT:    lbu s1, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu a0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu a0, 14(a0)
-; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a4, s5, 24
+; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s2
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sb a3, 23(sp)
-; RV32I-NEXT:    sb a0, 22(sp)
-; RV32I-NEXT:    sb s5, 21(sp)
-; RV32I-NEXT:    sb s4, 20(sp)
-; RV32I-NEXT:    sb s3, 19(sp)
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    sb s5, 23(sp)
+; RV32I-NEXT:    sb s4, 22(sp)
+; RV32I-NEXT:    sb s3, 21(sp)
+; RV32I-NEXT:    sb s2, 20(sp)
+; RV32I-NEXT:    sb s1, 19(sp)
 ; RV32I-NEXT:    sb s0, 18(sp)
 ; RV32I-NEXT:    sb t6, 17(sp)
 ; RV32I-NEXT:    sb t5, 16(sp)
@@ -1325,109 +1329,109 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a4, 32(sp)
 ; RV32I-NEXT:    sb a4, 28(sp)
 ; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 39(sp)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 39(sp)
 ; RV32I-NEXT:    srli a3, a4, 16
 ; RV32I-NEXT:    sb a3, 38(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 37(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
+; RV32I-NEXT:    sb a1, 35(sp)
 ; RV32I-NEXT:    sb a3, 34(sp)
 ; RV32I-NEXT:    sb a4, 33(sp)
-; RV32I-NEXT:    sb a0, 31(sp)
+; RV32I-NEXT:    sb a1, 31(sp)
 ; RV32I-NEXT:    sb a3, 30(sp)
 ; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
+; RV32I-NEXT:    sb a1, 27(sp)
 ; RV32I-NEXT:    sb a3, 26(sp)
 ; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 8
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a6, 3(a1)
+; RV32I-NEXT:    lbu a7, 4(a1)
+; RV32I-NEXT:    lbu t0, 5(a1)
+; RV32I-NEXT:    lbu t1, 6(a1)
+; RV32I-NEXT:    lbu t2, 7(a1)
+; RV32I-NEXT:    lbu t3, 8(a1)
+; RV32I-NEXT:    lbu t4, 9(a1)
+; RV32I-NEXT:    lbu t5, 10(a1)
+; RV32I-NEXT:    lbu t6, 11(a1)
+; RV32I-NEXT:    lbu s0, 12(a1)
+; RV32I-NEXT:    lbu s1, 13(a1)
+; RV32I-NEXT:    lbu s2, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    sra a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl t0, a7, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    not t3, a0
+; RV32I-NEXT:    sll t2, t2, t3
+; RV32I-NEXT:    or t2, t0, t2
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    xori a4, a0, 31
+; RV32I-NEXT:    sll a5, a7, a4
+; RV32I-NEXT:    or a5, a3, a5
+; RV32I-NEXT:    srl a6, t1, a0
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    sll a4, a7, a4
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    sra a0, a1, a0
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb t0, 4(a2)
+; RV32I-NEXT:    srli a1, a6, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a6, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
-; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 11(a2)
 ; RV32I-NEXT:    srli a5, a5, 24
 ; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a0, t2, 24
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
@@ -1460,18 +1464,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu s2, 29(a0)
+; RV64I-NEXT:    lbu s4, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a5, a1, a3
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1480,69 +1509,28 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    lbu s0, 12(a0)
 ; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s5, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu s11, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu a7, 23(a0)
+; RV64I-NEXT:    lbu a6, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 87(sp)
-; RV64I-NEXT:    sb a3, 86(sp)
-; RV64I-NEXT:    sb a4, 85(sp)
+; RV64I-NEXT:    sb s6, 87(sp)
+; RV64I-NEXT:    sb s4, 86(sp)
+; RV64I-NEXT:    sb s2, 85(sp)
 ; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a5, 83(sp)
-; RV64I-NEXT:    sb a6, 82(sp)
-; RV64I-NEXT:    sb a7, 81(sp)
-; RV64I-NEXT:    sb s11, 80(sp)
-; RV64I-NEXT:    sb s10, 79(sp)
-; RV64I-NEXT:    sb ra, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
+; RV64I-NEXT:    sb a1, 83(sp)
+; RV64I-NEXT:    sb a3, 82(sp)
+; RV64I-NEXT:    sb a4, 81(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1575,6 +1563,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 90(sp)
 ; RV64I-NEXT:    sb zero, 89(sp)
 ; RV64I-NEXT:    sb zero, 88(sp)
+; RV64I-NEXT:    sb a6, 80(sp)
+; RV64I-NEXT:    sb a7, 79(sp)
+; RV64I-NEXT:    sb t0, 78(sp)
+; RV64I-NEXT:    sb ra, 77(sp)
+; RV64I-NEXT:    sb s11, 76(sp)
+; RV64I-NEXT:    sb s10, 75(sp)
+; RV64I-NEXT:    sb s9, 74(sp)
+; RV64I-NEXT:    sb s8, 73(sp)
+; RV64I-NEXT:    sb s7, 72(sp)
+; RV64I-NEXT:    sb s5, 71(sp)
+; RV64I-NEXT:    sb s3, 70(sp)
+; RV64I-NEXT:    sb s1, 69(sp)
+; RV64I-NEXT:    sb s0, 68(sp)
+; RV64I-NEXT:    sb t6, 67(sp)
+; RV64I-NEXT:    sb t5, 66(sp)
+; RV64I-NEXT:    sb t4, 65(sp)
 ; RV64I-NEXT:    sb t3, 64(sp)
 ; RV64I-NEXT:    sb t2, 63(sp)
 ; RV64I-NEXT:    sb t1, 62(sp)
@@ -1590,111 +1594,112 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 57(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    slli a0, a5, 56
+; RV64I-NEXT:    mv s11, a5
 ; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a3, sp, 56
-; RV64I-NEXT:    add a3, a3, a0
-; RV64I-NEXT:    lbu a0, 9(a3)
-; RV64I-NEXT:    lbu a1, 8(a3)
-; RV64I-NEXT:    lbu a4, 10(a3)
-; RV64I-NEXT:    lbu a5, 11(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a1, 13(a3)
-; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a5, 14(a3)
-; RV64I-NEXT:    lbu a6, 15(a3)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    addi a1, sp, 56
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu t3, 16(a0)
+; RV64I-NEXT:    lbu t4, 17(a0)
+; RV64I-NEXT:    lbu t5, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu s0, 20(a0)
+; RV64I-NEXT:    lbu a1, 21(a0)
+; RV64I-NEXT:    lbu s1, 22(a0)
+; RV64I-NEXT:    lbu s2, 23(a0)
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s4, 25(a0)
+; RV64I-NEXT:    lbu s5, 26(a0)
+; RV64I-NEXT:    lbu s6, 27(a0)
+; RV64I-NEXT:    lbu s7, 28(a0)
+; RV64I-NEXT:    lbu s8, 29(a0)
+; RV64I-NEXT:    lbu s9, 30(a0)
+; RV64I-NEXT:    lbu s10, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a4, a1, a0
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a0, 17(a3)
-; RV64I-NEXT:    lbu a5, 16(a3)
-; RV64I-NEXT:    lbu a6, 18(a3)
-; RV64I-NEXT:    lbu a7, 19(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a3)
-; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu a7, 22(a3)
-; RV64I-NEXT:    lbu t0, 23(a3)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a4, a4, a3
+; RV64I-NEXT:    andi a3, s11, 7
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or a5, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a6, t6, t5
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a1
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a3)
-; RV64I-NEXT:    lbu a7, 0(a3)
-; RV64I-NEXT:    lbu t0, 2(a3)
-; RV64I-NEXT:    lbu t1, 3(a3)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    or a6, s2, s1
+; RV64I-NEXT:    or a1, a6, a1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    slli a1, a5, 1
+; RV64I-NEXT:    not a6, a3
+; RV64I-NEXT:    sll a1, a1, a6
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a3)
-; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t1, 6(a3)
-; RV64I-NEXT:    lbu t2, 7(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a3)
-; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t1, 26(a3)
-; RV64I-NEXT:    lbu t2, 27(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    lbu t1, 28(a3)
-; RV64I-NEXT:    lbu t2, 30(a3)
-; RV64I-NEXT:    lbu a3, 31(a3)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a3, t2
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t4
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a6, a0, a6
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s4, s3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or a7, s6, s5
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    or a7, s8, s7
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or t0, s10, s9
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a3, a3, t0
-; RV64I-NEXT:    xori t0, a1, 63
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    xori t0, a3, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a7, a3, a7
-; RV64I-NEXT:    slli a3, a7, 1
-; RV64I-NEXT:    sll t0, a3, t0
-; RV64I-NEXT:    srl a3, a4, a1
-; RV64I-NEXT:    srl a4, a6, a1
-; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    srl a1, a7, a1
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a7, a7, a0
+; RV64I-NEXT:    slli a0, a7, 1
+; RV64I-NEXT:    sll t0, a0, t0
+; RV64I-NEXT:    srl a0, a4, a3
+; RV64I-NEXT:    srl a4, a6, a3
+; RV64I-NEXT:    srl a5, a5, a3
+; RV64I-NEXT:    srl a3, a7, a3
 ; RV64I-NEXT:    srli a6, a5, 48
 ; RV64I-NEXT:    sb a6, 22(a2)
 ; RV64I-NEXT:    srli a6, a5, 40
@@ -1709,55 +1714,55 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a5, 16(a2)
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    srli a5, a3, 56
 ; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    srli a5, a3, 40
 ; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    srli a5, a3, 32
 ; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    srli a5, a3, 24
 ; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    srli a5, a3, 16
 ; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 25(a2)
-; RV64I-NEXT:    srli a1, a4, 48
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    srli a1, a4, 40
-; RV64I-NEXT:    sb a1, 5(a2)
-; RV64I-NEXT:    srli a1, a4, 32
-; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    srli a1, a4, 24
-; RV64I-NEXT:    sb a1, 3(a2)
-; RV64I-NEXT:    srli a1, a4, 16
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    or a1, a4, t1
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, a4, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a4, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    or a3, a4, t1
 ; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a3, 48
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a3, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a3, 32
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a3, 24
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a3, 16
+; RV64I-NEXT:    srli a4, a0, 16
 ; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    srli a3, a6, 56
-; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    or a1, a0, a1
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 7(a2)
 ; RV64I-NEXT:    srli a1, a1, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
+; RV64I-NEXT:    sb a1, 15(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1790,18 +1795,32 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a4, 31(a0)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a1, a5
+; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1816,44 +1835,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s5, 17(a0)
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 59(sp)
-; RV32I-NEXT:    sb a3, 58(sp)
-; RV32I-NEXT:    sb a4, 57(sp)
+; RV32I-NEXT:    sb a4, 59(sp)
+; RV32I-NEXT:    sb t0, 58(sp)
+; RV32I-NEXT:    sb ra, 57(sp)
 ; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a5, 55(sp)
-; RV32I-NEXT:    sb a6, 54(sp)
-; RV32I-NEXT:    sb a7, 53(sp)
-; RV32I-NEXT:    sb s10, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
+; RV32I-NEXT:    sb a1, 55(sp)
+; RV32I-NEXT:    sb a3, 54(sp)
 ; RV32I-NEXT:    sb zero, 91(sp)
 ; RV32I-NEXT:    sb zero, 90(sp)
 ; RV32I-NEXT:    sb zero, 89(sp)
@@ -1886,6 +1882,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 62(sp)
 ; RV32I-NEXT:    sb zero, 61(sp)
 ; RV32I-NEXT:    sb zero, 60(sp)
+; RV32I-NEXT:    sb a5, 53(sp)
+; RV32I-NEXT:    sb a7, 52(sp)
+; RV32I-NEXT:    sb s11, 51(sp)
+; RV32I-NEXT:    sb s10, 50(sp)
+; RV32I-NEXT:    sb s9, 49(sp)
+; RV32I-NEXT:    sb s8, 48(sp)
+; RV32I-NEXT:    sb s7, 47(sp)
+; RV32I-NEXT:    sb s6, 46(sp)
+; RV32I-NEXT:    sb s5, 45(sp)
+; RV32I-NEXT:    sb s4, 44(sp)
 ; RV32I-NEXT:    sb s3, 43(sp)
 ; RV32I-NEXT:    sb s2, 42(sp)
 ; RV32I-NEXT:    sb s1, 41(sp)
@@ -1896,187 +1902,192 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t3, 36(sp)
 ; RV32I-NEXT:    sb t2, 35(sp)
 ; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    slli a0, a6, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t0, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 8(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu s1, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu a6, 20(a0)
+; RV32I-NEXT:    lbu s5, 21(a0)
+; RV32I-NEXT:    lbu s6, 22(a0)
+; RV32I-NEXT:    lbu s7, 23(a0)
+; RV32I-NEXT:    lbu s4, 24(a0)
+; RV32I-NEXT:    lbu s8, 25(a0)
+; RV32I-NEXT:    lbu s9, 26(a0)
+; RV32I-NEXT:    lbu s10, 27(a0)
+; RV32I-NEXT:    lbu s11, 28(a0)
+; RV32I-NEXT:    lbu a5, 29(a0)
+; RV32I-NEXT:    lbu ra, 30(a0)
+; RV32I-NEXT:    lbu a3, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a1, a4
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t0, 0(a0)
+; RV32I-NEXT:    lbu t1, 1(a0)
+; RV32I-NEXT:    or a7, a7, a1
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    or a1, a0, t0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, t0, a0
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a0, s3, s2
+; RV32I-NEXT:    or s0, a0, s0
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or a0, s5, a6
+; RV32I-NEXT:    lw a6, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t2, a6, 7
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or a6, s7, s6
+; RV32I-NEXT:    slli t0, a7, 1
+; RV32I-NEXT:    or t3, a6, a0
+; RV32I-NEXT:    not t4, t2
+; RV32I-NEXT:    sll a0, t0, t4
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or t0, s8, s4
+; RV32I-NEXT:    slli t5, a4, 1
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or t6, s10, s9
+; RV32I-NEXT:    slli a6, s0, 1
+; RV32I-NEXT:    sll a6, a6, t4
+; RV32I-NEXT:    or t6, t6, t0
+; RV32I-NEXT:    slli t0, t6, 1
+; RV32I-NEXT:    sll t4, t0, t4
+; RV32I-NEXT:    xori s1, t2, 31
+; RV32I-NEXT:    sll t0, t5, s1
 ; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
+; RV32I-NEXT:    or a5, a5, s11
+; RV32I-NEXT:    slli t5, t1, 1
+; RV32I-NEXT:    sll t5, t5, s1
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, ra
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    srl a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    sll s2, s2, s1
+; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    slli a5, a3, 1
+; RV32I-NEXT:    sll a5, a5, s1
+; RV32I-NEXT:    srl a4, a4, t2
+; RV32I-NEXT:    srl a1, a1, t2
+; RV32I-NEXT:    srl t1, t1, t2
+; RV32I-NEXT:    srl a7, a7, t2
+; RV32I-NEXT:    srl t3, t3, t2
+; RV32I-NEXT:    srl s0, s0, t2
+; RV32I-NEXT:    srl t6, t6, t2
+; RV32I-NEXT:    srl a3, a3, t2
+; RV32I-NEXT:    srli t2, t6, 16
+; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    or a5, t6, a5
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    srli t2, t6, 8
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a3, 24
+; RV32I-NEXT:    sb t2, 31(a2)
+; RV32I-NEXT:    srli t2, a3, 16
+; RV32I-NEXT:    sb t2, 30(a2)
 ; RV32I-NEXT:    sb a3, 28(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
+; RV32I-NEXT:    srli a3, s0, 16
 ; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    or a3, s0, s2
+; RV32I-NEXT:    sb s0, 16(a2)
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    sb s0, 17(a2)
+; RV32I-NEXT:    srli t2, t3, 16
+; RV32I-NEXT:    sb t2, 22(a2)
+; RV32I-NEXT:    or t2, t3, t4
 ; RV32I-NEXT:    sb t3, 20(a2)
 ; RV32I-NEXT:    srli t3, t3, 8
 ; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
+; RV32I-NEXT:    srli t3, a7, 16
 ; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
+; RV32I-NEXT:    or t3, a7, t5
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    srli a7, a7, 8
+; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    srli a7, t1, 16
+; RV32I-NEXT:    sb a7, 14(a2)
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    sb t1, 12(a2)
+; RV32I-NEXT:    srli a7, t1, 8
 ; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
+; RV32I-NEXT:    srli a7, a1, 16
 ; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    or a7, a1, t0
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 6(a2)
 ; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
-; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 27(a2)
 ; RV32I-NEXT:    srli a3, a3, 24
 ; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
+; RV32I-NEXT:    srli a1, t2, 24
+; RV32I-NEXT:    sb a1, 23(a2)
+; RV32I-NEXT:    srli a1, t3, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a1, a7, 24
 ; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
@@ -2118,18 +2129,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu s2, 29(a0)
+; RV64I-NEXT:    lbu s4, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or s11, a1, a3
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -2138,70 +2174,28 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    lbu s0, 12(a0)
 ; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s5, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu t0, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu a6, 23(a0)
+; RV64I-NEXT:    lbu a5, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    sb a4, 117(sp)
+; RV64I-NEXT:    sb s6, 119(sp)
+; RV64I-NEXT:    sb s4, 118(sp)
+; RV64I-NEXT:    sb s2, 117(sp)
 ; RV64I-NEXT:    sb a0, 116(sp)
-; RV64I-NEXT:    sb a5, 115(sp)
-; RV64I-NEXT:    sb a6, 114(sp)
-; RV64I-NEXT:    sb a7, 113(sp)
-; RV64I-NEXT:    sb s11, 112(sp)
-; RV64I-NEXT:    sb s10, 111(sp)
-; RV64I-NEXT:    sb ra, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
+; RV64I-NEXT:    sb a1, 115(sp)
+; RV64I-NEXT:    sb a3, 114(sp)
+; RV64I-NEXT:    sb a4, 113(sp)
 ; RV64I-NEXT:    sb zero, 87(sp)
 ; RV64I-NEXT:    sb zero, 86(sp)
 ; RV64I-NEXT:    sb zero, 85(sp)
@@ -2234,6 +2228,23 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 58(sp)
 ; RV64I-NEXT:    sb zero, 57(sp)
 ; RV64I-NEXT:    sb zero, 56(sp)
+; RV64I-NEXT:    sb a5, 112(sp)
+; RV64I-NEXT:    sb a6, 111(sp)
+; RV64I-NEXT:    sb a7, 110(sp)
+; RV64I-NEXT:    sb ra, 109(sp)
+; RV64I-NEXT:    sb t0, 108(sp)
+; RV64I-NEXT:    sb s10, 107(sp)
+; RV64I-NEXT:    sb s9, 106(sp)
+; RV64I-NEXT:    sb s8, 105(sp)
+; RV64I-NEXT:    sb s7, 104(sp)
+; RV64I-NEXT:    sb s5, 103(sp)
+; RV64I-NEXT:    sb s3, 102(sp)
+; RV64I-NEXT:    sb s1, 101(sp)
+; RV64I-NEXT:    sb s0, 100(sp)
+; RV64I-NEXT:    sb t6, 99(sp)
+; RV64I-NEXT:    sb t5, 98(sp)
+; RV64I-NEXT:    sb t4, 97(sp)
+; RV64I-NEXT:    sb t3, 96(sp)
 ; RV64I-NEXT:    sb t2, 95(sp)
 ; RV64I-NEXT:    sb t1, 94(sp)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
@@ -2248,173 +2259,173 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 89(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    slli a0, s11, 56
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 88
 ; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    lbu a1, 9(a0)
 ; RV64I-NEXT:    lbu a3, 8(a0)
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a5, 11(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    lbu a3, 13(a0)
-; RV64I-NEXT:    lbu a4, 12(a0)
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu a6, 15(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu a1, 16(a0)
+; RV64I-NEXT:    lbu t3, 17(a0)
+; RV64I-NEXT:    lbu t4, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu t5, 20(a0)
+; RV64I-NEXT:    lbu s0, 21(a0)
+; RV64I-NEXT:    lbu s1, 22(a0)
+; RV64I-NEXT:    lbu s2, 23(a0)
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s4, 25(a0)
+; RV64I-NEXT:    lbu s5, 26(a0)
+; RV64I-NEXT:    lbu s6, 27(a0)
+; RV64I-NEXT:    lbu s7, 28(a0)
+; RV64I-NEXT:    lbu s8, 29(a0)
+; RV64I-NEXT:    lbu s9, 30(a0)
+; RV64I-NEXT:    lbu s10, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a3, a1
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 1(a0)
 ; RV64I-NEXT:    lbu a6, 2(a0)
 ; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t2
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a4, a0, a4
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s4, s3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or a5, s6, s5
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    or a5, s8, s7
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or a6, s10, s9
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t0, 27(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t0, 30(a0)
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t0, 18(a0)
-; RV64I-NEXT:    lbu t1, 19(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    lbu t0, 20(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t1, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    srli t0, a4, 1
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or t1, a0, t1
-; RV64I-NEXT:    xori t2, a1, 63
-; RV64I-NEXT:    srl a0, t0, t2
-; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    or a5, a5, a0
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a0, t3, a1
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a1, t6, t4
+; RV64I-NEXT:    or a1, a1, a0
+; RV64I-NEXT:    andi a6, s11, 7
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    or a7, s0, t5
+; RV64I-NEXT:    srli a0, a4, 1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    or t0, s2, s1
+; RV64I-NEXT:    xori t1, a6, 63
+; RV64I-NEXT:    srl a0, a0, t1
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    srli a7, a6, 1
-; RV64I-NEXT:    srl a7, a7, t2
-; RV64I-NEXT:    srli t0, a3, 1
-; RV64I-NEXT:    not t1, a1
-; RV64I-NEXT:    srl t0, t0, t1
-; RV64I-NEXT:    sll a3, a3, a1
-; RV64I-NEXT:    sll a5, a5, a1
-; RV64I-NEXT:    sll a6, a6, a1
-; RV64I-NEXT:    sll a1, a4, a1
-; RV64I-NEXT:    srli a4, a6, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a6, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a6, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a6, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a6, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    or a4, a6, t0
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    sb a6, 17(a2)
+; RV64I-NEXT:    or a7, a7, a1
+; RV64I-NEXT:    srli a1, a7, 1
+; RV64I-NEXT:    srl t0, a1, t1
+; RV64I-NEXT:    srli a1, a3, 1
+; RV64I-NEXT:    not t1, a6
+; RV64I-NEXT:    srl t1, a1, t1
+; RV64I-NEXT:    sll a1, a3, a6
+; RV64I-NEXT:    sll a3, a5, a6
+; RV64I-NEXT:    sll a5, a7, a6
+; RV64I-NEXT:    sll a4, a4, a6
 ; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    sb a6, 31(a2)
+; RV64I-NEXT:    sb a6, 23(a2)
 ; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 30(a2)
+; RV64I-NEXT:    sb a6, 22(a2)
 ; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 29(a2)
+; RV64I-NEXT:    sb a6, 21(a2)
 ; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 28(a2)
+; RV64I-NEXT:    sb a6, 20(a2)
 ; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 27(a2)
+; RV64I-NEXT:    sb a6, 19(a2)
 ; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    or a6, a5, a7
+; RV64I-NEXT:    sb a6, 18(a2)
+; RV64I-NEXT:    or a6, a5, t1
 ; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 6(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 2(a2)
-; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 1(a2)
-; RV64I-NEXT:    srli a1, a3, 56
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a3, 48
-; RV64I-NEXT:    sb a1, 14(a2)
-; RV64I-NEXT:    srli a1, a3, 40
-; RV64I-NEXT:    sb a1, 13(a2)
-; RV64I-NEXT:    srli a1, a3, 32
-; RV64I-NEXT:    sb a1, 12(a2)
-; RV64I-NEXT:    srli a1, a3, 24
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    srli a1, a3, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a5, a3, 56
+; RV64I-NEXT:    sb a5, 31(a2)
+; RV64I-NEXT:    srli a5, a3, 48
+; RV64I-NEXT:    sb a5, 30(a2)
+; RV64I-NEXT:    srli a5, a3, 40
+; RV64I-NEXT:    sb a5, 29(a2)
+; RV64I-NEXT:    srli a5, a3, 32
+; RV64I-NEXT:    sb a5, 28(a2)
+; RV64I-NEXT:    srli a5, a3, 24
+; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    srli a5, a3, 16
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    or a5, a3, t0
 ; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a6, 24(a2)
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, a4, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a4, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a4, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a5, 24(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
@@ -2448,18 +2459,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a4, 31(a0)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a1, a5
+; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -2474,44 +2499,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s5, 17(a0)
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    sb a4, 89(sp)
+; RV32I-NEXT:    sb a4, 91(sp)
+; RV32I-NEXT:    sb t0, 90(sp)
+; RV32I-NEXT:    sb ra, 89(sp)
 ; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a5, 87(sp)
-; RV32I-NEXT:    sb a6, 86(sp)
-; RV32I-NEXT:    sb a7, 85(sp)
-; RV32I-NEXT:    sb s10, 84(sp)
-; RV32I-NEXT:    sb ra, 83(sp)
-; RV32I-NEXT:    sb s11, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
+; RV32I-NEXT:    sb a1, 87(sp)
+; RV32I-NEXT:    sb a3, 86(sp)
 ; RV32I-NEXT:    sb zero, 59(sp)
 ; RV32I-NEXT:    sb zero, 58(sp)
 ; RV32I-NEXT:    sb zero, 57(sp)
@@ -2544,6 +2546,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
 ; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb a5, 85(sp)
+; RV32I-NEXT:    sb a7, 84(sp)
+; RV32I-NEXT:    sb s11, 83(sp)
+; RV32I-NEXT:    sb s10, 82(sp)
+; RV32I-NEXT:    sb s9, 81(sp)
+; RV32I-NEXT:    sb s8, 80(sp)
+; RV32I-NEXT:    sb s7, 79(sp)
+; RV32I-NEXT:    sb s6, 78(sp)
+; RV32I-NEXT:    sb s5, 77(sp)
+; RV32I-NEXT:    sb s4, 76(sp)
 ; RV32I-NEXT:    sb s3, 75(sp)
 ; RV32I-NEXT:    sb s2, 74(sp)
 ; RV32I-NEXT:    sb s1, 73(sp)
@@ -2554,189 +2566,194 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t3, 68(sp)
 ; RV32I-NEXT:    sb t2, 67(sp)
 ; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    slli a0, a6, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 60
-; RV32I-NEXT:    sub a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
+; RV32I-NEXT:    addi a1, sp, 60
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    sw a6, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    sw a6, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
+; RV32I-NEXT:    lbu t5, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu t0, 17(a0)
+; RV32I-NEXT:    lbu s1, 18(a0)
+; RV32I-NEXT:    lbu s2, 19(a0)
+; RV32I-NEXT:    lbu s5, 20(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu s9, 23(a0)
+; RV32I-NEXT:    lbu s3, 24(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
+; RV32I-NEXT:    lbu s4, 26(a0)
+; RV32I-NEXT:    lbu s7, 27(a0)
+; RV32I-NEXT:    lbu s11, 28(a0)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu a6, 30(a0)
+; RV32I-NEXT:    lbu a7, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, a5, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    or a5, a1, a4
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a4, a0, a1
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or a0, t6, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    or a1, t5, t4
+; RV32I-NEXT:    or t3, a1, a0
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a1, t0, 7
-; RV32I-NEXT:    lbu a0, 1(a4)
-; RV32I-NEXT:    lbu a3, 0(a4)
-; RV32I-NEXT:    lbu a5, 2(a4)
-; RV32I-NEXT:    lbu a6, 3(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a6, a5
-; RV32I-NEXT:    or a6, a3, a0
-; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    xori a7, a1, 31
-; RV32I-NEXT:    srl a0, a0, a7
-; RV32I-NEXT:    lbu a3, 13(a4)
-; RV32I-NEXT:    lbu a5, 12(a4)
-; RV32I-NEXT:    lbu t0, 14(a4)
-; RV32I-NEXT:    lbu t1, 15(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t1, t0
-; RV32I-NEXT:    or t0, a5, a3
-; RV32I-NEXT:    lbu a3, 9(a4)
-; RV32I-NEXT:    lbu a5, 8(a4)
-; RV32I-NEXT:    lbu t1, 10(a4)
-; RV32I-NEXT:    lbu t2, 11(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a5, t2, t1
-; RV32I-NEXT:    or t1, a5, a3
-; RV32I-NEXT:    srli a3, t1, 1
-; RV32I-NEXT:    srl a5, a3, a7
-; RV32I-NEXT:    srli t4, t5, 1
-; RV32I-NEXT:    not t2, a1
-; RV32I-NEXT:    lbu a3, 21(a4)
-; RV32I-NEXT:    lbu t3, 20(a4)
-; RV32I-NEXT:    lbu t6, 22(a4)
-; RV32I-NEXT:    lbu s0, 23(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t3, s0, t6
-; RV32I-NEXT:    or t3, t3, a3
-; RV32I-NEXT:    lbu a3, 17(a4)
-; RV32I-NEXT:    lbu t6, 16(a4)
-; RV32I-NEXT:    lbu s0, 18(a4)
-; RV32I-NEXT:    lbu s1, 19(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a3
-; RV32I-NEXT:    lbu a3, 29(a4)
-; RV32I-NEXT:    lbu t6, 28(a4)
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu s2, 31(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
+; RV32I-NEXT:    or a1, t2, t1
+; RV32I-NEXT:    or t1, a1, a0
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or a0, s10, s5
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    or a1, s9, s8
+; RV32I-NEXT:    or t2, a1, a0
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a0, t0, s0
+; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t0, a1, 7
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    lbu s1, 25(a4)
-; RV32I-NEXT:    lbu s2, 24(a4)
-; RV32I-NEXT:    srl t4, t4, t2
-; RV32I-NEXT:    or t6, t6, a3
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s1, s2
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu a4, 27(a4)
-; RV32I-NEXT:    srli s2, s0, 1
-; RV32I-NEXT:    srl s2, s2, a7
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    srli s1, t0, 1
-; RV32I-NEXT:    srl s1, s1, t2
-; RV32I-NEXT:    or a4, a4, a3
+; RV32I-NEXT:    or a1, s2, s1
 ; RV32I-NEXT:    srli a3, a4, 1
-; RV32I-NEXT:    srl a7, a3, a7
-; RV32I-NEXT:    srli a3, t3, 1
-; RV32I-NEXT:    srl t2, a3, t2
-; RV32I-NEXT:    sll a3, t5, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t3, t3, a1
-; RV32I-NEXT:    sll t5, s0, a1
-; RV32I-NEXT:    sll t6, t6, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll a1, a6, a1
-; RV32I-NEXT:    srli a6, a4, 24
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    or a6, a4, t2
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 25(a2)
-; RV32I-NEXT:    srli a4, t6, 24
-; RV32I-NEXT:    sb a4, 31(a2)
-; RV32I-NEXT:    srli a4, t6, 16
-; RV32I-NEXT:    sb a4, 30(a2)
-; RV32I-NEXT:    or a4, t6, a7
-; RV32I-NEXT:    srli a7, t6, 8
-; RV32I-NEXT:    sb a7, 29(a2)
-; RV32I-NEXT:    srli a7, t5, 24
-; RV32I-NEXT:    sb a7, 19(a2)
-; RV32I-NEXT:    srli a7, t5, 16
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    or a7, t5, s1
-; RV32I-NEXT:    srli t2, t5, 8
-; RV32I-NEXT:    sb t2, 17(a2)
-; RV32I-NEXT:    srli t2, t3, 24
-; RV32I-NEXT:    sb t2, 23(a2)
-; RV32I-NEXT:    srli t2, t3, 16
-; RV32I-NEXT:    sb t2, 22(a2)
-; RV32I-NEXT:    or t2, t3, s2
+; RV32I-NEXT:    or t4, a1, a0
+; RV32I-NEXT:    xori t5, t0, 31
+; RV32I-NEXT:    srl a0, a3, t5
+; RV32I-NEXT:    slli ra, ra, 8
+; RV32I-NEXT:    or a3, ra, s11
+; RV32I-NEXT:    srli a1, t1, 1
+; RV32I-NEXT:    srl a1, a1, t5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    srli a7, a5, 1
+; RV32I-NEXT:    or t6, a6, a3
+; RV32I-NEXT:    not a3, t0
+; RV32I-NEXT:    srl a6, a7, a3
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    or a7, s6, s3
+; RV32I-NEXT:    srli s0, t4, 1
+; RV32I-NEXT:    srl s0, s0, t5
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or s1, s7, s4
+; RV32I-NEXT:    srli s2, t3, 1
+; RV32I-NEXT:    srl s2, s2, a3
+; RV32I-NEXT:    or a7, s1, a7
+; RV32I-NEXT:    srli s1, a7, 1
+; RV32I-NEXT:    srl t5, s1, t5
+; RV32I-NEXT:    srli s1, t2, 1
+; RV32I-NEXT:    srl s1, s1, a3
+; RV32I-NEXT:    sll a3, a5, t0
+; RV32I-NEXT:    sll a5, t3, t0
+; RV32I-NEXT:    sll t1, t1, t0
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    sll t3, t4, t0
+; RV32I-NEXT:    sll t4, t6, t0
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    sll a4, a4, t0
+; RV32I-NEXT:    srli t0, a7, 24
+; RV32I-NEXT:    sb t0, 27(a2)
+; RV32I-NEXT:    srli t0, a7, 16
+; RV32I-NEXT:    sb t0, 26(a2)
+; RV32I-NEXT:    or t0, a7, s1
+; RV32I-NEXT:    srli a7, a7, 8
+; RV32I-NEXT:    sb a7, 25(a2)
+; RV32I-NEXT:    srli a7, t4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, t4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
+; RV32I-NEXT:    or a7, t4, t5
+; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    sb t4, 29(a2)
+; RV32I-NEXT:    srli t4, t3, 24
+; RV32I-NEXT:    sb t4, 19(a2)
+; RV32I-NEXT:    srli t4, t3, 16
+; RV32I-NEXT:    sb t4, 18(a2)
+; RV32I-NEXT:    or t4, t3, s2
 ; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, t1, 24
-; RV32I-NEXT:    sb t3, 11(a2)
-; RV32I-NEXT:    srli t3, t1, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, t1, t4
+; RV32I-NEXT:    sb t3, 17(a2)
+; RV32I-NEXT:    srli t3, t2, 24
+; RV32I-NEXT:    sb t3, 23(a2)
+; RV32I-NEXT:    srli t3, t2, 16
+; RV32I-NEXT:    sb t3, 22(a2)
+; RV32I-NEXT:    or t3, t2, s0
+; RV32I-NEXT:    srli t2, t2, 8
+; RV32I-NEXT:    sb t2, 21(a2)
+; RV32I-NEXT:    srli t2, t1, 24
+; RV32I-NEXT:    sb t2, 11(a2)
+; RV32I-NEXT:    srli t2, t1, 16
+; RV32I-NEXT:    sb t2, 10(a2)
+; RV32I-NEXT:    or a6, t1, a6
 ; RV32I-NEXT:    srli t1, t1, 8
 ; RV32I-NEXT:    sb t1, 9(a2)
-; RV32I-NEXT:    srli t1, t0, 24
+; RV32I-NEXT:    srli t1, a5, 24
 ; RV32I-NEXT:    sb t1, 15(a2)
-; RV32I-NEXT:    srli t1, t0, 16
+; RV32I-NEXT:    srli t1, a5, 16
 ; RV32I-NEXT:    sb t1, 14(a2)
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb t0, 13(a2)
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    srli t0, a1, 16
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 13(a2)
+; RV32I-NEXT:    srli a5, a4, 24
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    sb a5, 2(a2)
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 1(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 6(a2)
 ; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t2, 20(a2)
-; RV32I-NEXT:    sb t3, 8(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t4, 16(a2)
+; RV32I-NEXT:    sb t3, 20(a2)
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
@@ -2776,19 +2793,43 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a3, 29(a0)
 ; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
+; RV64I-NEXT:    lbu a3, 30(a0)
 ; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
 ; RV64I-NEXT:    lbu t3, 6(a0)
 ; RV64I-NEXT:    lbu t4, 7(a0)
 ; RV64I-NEXT:    lbu t5, 8(a0)
@@ -2803,50 +2844,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s7, 17(a0)
 ; RV64I-NEXT:    lbu s8, 18(a0)
 ; RV64I-NEXT:    lbu s9, 19(a0)
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or a3, s11, a3
-; RV64I-NEXT:    lbu s11, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s11
-; RV64I-NEXT:    lbu s11, 20(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t2, a1, a3
-; RV64I-NEXT:    lbu t0, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a4, 28(a0)
-; RV64I-NEXT:    lbu a0, 27(a0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a4, 84(sp)
-; RV64I-NEXT:    sb a0, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb t0, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s11, 76(sp)
+; RV64I-NEXT:    lbu s10, 20(a0)
+; RV64I-NEXT:    lbu s11, 21(a0)
+; RV64I-NEXT:    lbu ra, 22(a0)
+; RV64I-NEXT:    lbu a7, 23(a0)
+; RV64I-NEXT:    lbu a6, 24(a0)
+; RV64I-NEXT:    lbu a5, 25(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    ld t0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t0, 86(sp)
+; RV64I-NEXT:    ld t0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t0, 85(sp)
+; RV64I-NEXT:    sb a1, 84(sp)
+; RV64I-NEXT:    sb a3, 83(sp)
+; RV64I-NEXT:    sb a4, 82(sp)
+; RV64I-NEXT:    sb a5, 81(sp)
+; RV64I-NEXT:    sb a6, 80(sp)
+; RV64I-NEXT:    sb a0, 87(sp)
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    sb a7, 79(sp)
+; RV64I-NEXT:    sb ra, 78(sp)
+; RV64I-NEXT:    sb s11, 77(sp)
+; RV64I-NEXT:    sb s10, 76(sp)
 ; RV64I-NEXT:    sb s9, 75(sp)
 ; RV64I-NEXT:    sb s8, 74(sp)
 ; RV64I-NEXT:    sb s7, 73(sp)
@@ -2859,23 +2881,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb s0, 66(sp)
 ; RV64I-NEXT:    sb t6, 65(sp)
 ; RV64I-NEXT:    sb t5, 64(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
 ; RV64I-NEXT:    sb t4, 63(sp)
 ; RV64I-NEXT:    sb t3, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
+; RV64I-NEXT:    sb t2, 61(sp)
+; RV64I-NEXT:    sb t1, 60(sp)
+; RV64I-NEXT:    ld a1, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 59(sp)
+; RV64I-NEXT:    ld a1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 58(sp)
+; RV64I-NEXT:    ld a1, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 57(sp)
+; RV64I-NEXT:    ld a1, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 56(sp)
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2915,108 +2933,109 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a6, 91(sp)
 ; RV64I-NEXT:    sb a7, 90(sp)
 ; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    slli a0, t2, 56
+; RV64I-NEXT:    ld ra, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    slli a0, ra, 56
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    lbu a0, 9(a1)
-; RV64I-NEXT:    lbu a3, 8(a1)
-; RV64I-NEXT:    lbu a4, 10(a1)
-; RV64I-NEXT:    lbu a5, 11(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a3, 13(a1)
-; RV64I-NEXT:    lbu a4, 12(a1)
-; RV64I-NEXT:    lbu a5, 14(a1)
-; RV64I-NEXT:    lbu a6, 15(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu t3, 16(a0)
+; RV64I-NEXT:    lbu t4, 17(a0)
+; RV64I-NEXT:    lbu t5, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu s0, 20(a0)
+; RV64I-NEXT:    lbu s1, 21(a0)
+; RV64I-NEXT:    lbu s2, 22(a0)
+; RV64I-NEXT:    lbu s3, 23(a0)
+; RV64I-NEXT:    lbu s4, 24(a0)
+; RV64I-NEXT:    lbu s5, 25(a0)
+; RV64I-NEXT:    lbu s6, 26(a0)
+; RV64I-NEXT:    lbu s7, 27(a0)
+; RV64I-NEXT:    lbu s8, 28(a0)
+; RV64I-NEXT:    lbu s9, 29(a0)
+; RV64I-NEXT:    lbu s10, 30(a0)
+; RV64I-NEXT:    lbu s11, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a1, a6, a5
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a4, t2, t1
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a4, a3, a0
-; RV64I-NEXT:    andi a3, t2, 7
-; RV64I-NEXT:    lbu a0, 17(a1)
-; RV64I-NEXT:    lbu a5, 16(a1)
-; RV64I-NEXT:    lbu a6, 18(a1)
-; RV64I-NEXT:    lbu a7, 19(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a1)
-; RV64I-NEXT:    lbu a6, 20(a1)
-; RV64I-NEXT:    lbu a7, 22(a1)
-; RV64I-NEXT:    lbu t0, 23(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a4, a3, a1
+; RV64I-NEXT:    andi a3, ra, 7
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or a1, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a5, t6, t5
+; RV64I-NEXT:    or a1, a5, a1
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a5, s3, s2
+; RV64I-NEXT:    or a5, a5, s0
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
+; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    slli a1, a5, 1
 ; RV64I-NEXT:    not a6, a3
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    sll a1, a1, a6
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a1)
-; RV64I-NEXT:    lbu t0, 24(a1)
-; RV64I-NEXT:    lbu t1, 26(a1)
-; RV64I-NEXT:    lbu t2, 27(a1)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a1)
-; RV64I-NEXT:    lbu t1, 28(a1)
-; RV64I-NEXT:    lbu t2, 30(a1)
-; RV64I-NEXT:    lbu a1, 31(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t4
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a6, a0, a6
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    or a0, s5, s4
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or a7, s7, s6
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or a7, s9, s8
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or t0, s11, s10
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    xori t0, a3, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a7, a1, a7
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll t0, a1, t0
-; RV64I-NEXT:    srl a1, a4, a3
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a7, a7, a0
+; RV64I-NEXT:    slli a0, a7, 1
+; RV64I-NEXT:    sll t0, a0, t0
+; RV64I-NEXT:    srl a0, a4, a3
 ; RV64I-NEXT:    srl a4, a6, a3
 ; RV64I-NEXT:    srl a5, a5, a3
 ; RV64I-NEXT:    sra a3, a7, a3
@@ -3063,26 +3082,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a1, 48
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a1, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a1, 32
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a1, 24
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a1, 16
+; RV64I-NEXT:    srli a4, a0, 16
 ; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    srli a1, a6, 56
-; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    or a1, a0, a1
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    sb a0, 23(a2)
 ; RV64I-NEXT:    srli a3, a3, 56
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
+; RV64I-NEXT:    srli a1, a1, 56
+; RV64I-NEXT:    sb a1, 15(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -3115,94 +3134,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t3, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu t0, 29(a0)
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or a3, a3, s11
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    or t1, a1, a3
-; RV32I-NEXT:    lbu t0, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a4, 28(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a4, 56(sp)
-; RV32I-NEXT:    sb a0, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb t0, 51(sp)
-; RV32I-NEXT:    sb ra, 50(sp)
-; RV32I-NEXT:    sb s11, 49(sp)
-; RV32I-NEXT:    sb s10, 48(sp)
-; RV32I-NEXT:    sb s9, 47(sp)
-; RV32I-NEXT:    sb s8, 46(sp)
-; RV32I-NEXT:    sb s7, 45(sp)
-; RV32I-NEXT:    sb s6, 44(sp)
-; RV32I-NEXT:    sb s5, 43(sp)
-; RV32I-NEXT:    sb t3, 59(sp)
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb s1, 39(sp)
-; RV32I-NEXT:    sb s0, 38(sp)
-; RV32I-NEXT:    sb t6, 37(sp)
-; RV32I-NEXT:    sb t5, 36(sp)
-; RV32I-NEXT:    sb t4, 35(sp)
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 5(a0)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s3, 14(a0)
+; RV32I-NEXT:    lbu s4, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s6, 17(a0)
+; RV32I-NEXT:    lbu s7, 18(a0)
+; RV32I-NEXT:    lbu s8, 19(a0)
+; RV32I-NEXT:    lbu s9, 20(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
+; RV32I-NEXT:    lbu s11, 22(a0)
+; RV32I-NEXT:    lbu ra, 23(a0)
+; RV32I-NEXT:    lbu a6, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    sb a7, 58(sp)
+; RV32I-NEXT:    sb t0, 57(sp)
+; RV32I-NEXT:    sb a1, 56(sp)
+; RV32I-NEXT:    sb a3, 55(sp)
+; RV32I-NEXT:    sb a4, 54(sp)
+; RV32I-NEXT:    sb a5, 53(sp)
+; RV32I-NEXT:    sb a0, 59(sp)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    sb a6, 52(sp)
+; RV32I-NEXT:    sb ra, 51(sp)
+; RV32I-NEXT:    sb s11, 50(sp)
+; RV32I-NEXT:    sb s10, 49(sp)
+; RV32I-NEXT:    sb s9, 48(sp)
+; RV32I-NEXT:    sb s8, 47(sp)
+; RV32I-NEXT:    sb s7, 46(sp)
+; RV32I-NEXT:    sb s6, 45(sp)
+; RV32I-NEXT:    sb s5, 44(sp)
+; RV32I-NEXT:    sb s4, 43(sp)
+; RV32I-NEXT:    sb s3, 42(sp)
+; RV32I-NEXT:    sb s2, 41(sp)
+; RV32I-NEXT:    sb s1, 40(sp)
+; RV32I-NEXT:    sb s0, 39(sp)
+; RV32I-NEXT:    sb t6, 38(sp)
+; RV32I-NEXT:    sb t5, 37(sp)
+; RV32I-NEXT:    sb t4, 36(sp)
+; RV32I-NEXT:    sb t3, 35(sp)
 ; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t3, 31
+; RV32I-NEXT:    sb t1, 33(sp)
+; RV32I-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 31(sp)
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 30(sp)
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -3238,175 +3256,181 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a1, 63(sp)
 ; RV32I-NEXT:    sb a3, 62(sp)
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    slli a0, t1, 24
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t1, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 8(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu s1, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu s4, 20(a0)
+; RV32I-NEXT:    lbu s6, 21(a0)
+; RV32I-NEXT:    lbu s7, 22(a0)
+; RV32I-NEXT:    lbu s8, 23(a0)
+; RV32I-NEXT:    lbu s5, 24(a0)
+; RV32I-NEXT:    lbu s9, 25(a0)
+; RV32I-NEXT:    lbu s10, 26(a0)
+; RV32I-NEXT:    lbu s11, 27(a0)
+; RV32I-NEXT:    lbu ra, 28(a0)
+; RV32I-NEXT:    lbu a5, 29(a0)
+; RV32I-NEXT:    lbu a6, 30(a0)
+; RV32I-NEXT:    lbu a3, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a1, a4
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t1, 0(a0)
+; RV32I-NEXT:    lbu t2, 1(a0)
+; RV32I-NEXT:    or t0, a7, a1
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    or a1, a0, a7
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or a7, t6, t5
+; RV32I-NEXT:    or t2, a7, a0
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a0, s3, s2
+; RV32I-NEXT:    or s0, a0, s0
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    or a0, s6, s4
+; RV32I-NEXT:    lw a7, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t3, a7, 7
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    or a7, s8, s7
+; RV32I-NEXT:    slli t1, t0, 1
+; RV32I-NEXT:    or t4, a7, a0
+; RV32I-NEXT:    not a7, t3
+; RV32I-NEXT:    sll a0, t1, a7
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t5, s9, s5
+; RV32I-NEXT:    slli t6, a4, 1
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s1, s11, s10
+; RV32I-NEXT:    slli t1, s0, 1
+; RV32I-NEXT:    sll t1, t1, a7
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    slli s1, t5, 1
+; RV32I-NEXT:    sll s1, s1, a7
+; RV32I-NEXT:    xori s2, t3, 31
+; RV32I-NEXT:    sll a7, t6, s2
 ; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    sra a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
+; RV32I-NEXT:    or a5, a5, ra
+; RV32I-NEXT:    slli t6, t2, 1
+; RV32I-NEXT:    sll t6, t6, s2
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    slli a6, t4, 1
+; RV32I-NEXT:    sll a6, a6, s2
+; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    slli a5, a3, 1
+; RV32I-NEXT:    sll a5, a5, s2
+; RV32I-NEXT:    srl a4, a4, t3
+; RV32I-NEXT:    srl a1, a1, t3
+; RV32I-NEXT:    srl t2, t2, t3
+; RV32I-NEXT:    srl t0, t0, t3
+; RV32I-NEXT:    srl t4, t4, t3
+; RV32I-NEXT:    srl s0, s0, t3
+; RV32I-NEXT:    srl t5, t5, t3
+; RV32I-NEXT:    sra a3, a3, t3
+; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    sb t3, 26(a2)
+; RV32I-NEXT:    or a5, t5, a5
 ; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    srli t3, t5, 8
+; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a3, 24
+; RV32I-NEXT:    sb t3, 31(a2)
+; RV32I-NEXT:    srli t3, a3, 16
+; RV32I-NEXT:    sb t3, 30(a2)
 ; RV32I-NEXT:    sb a3, 28(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
+; RV32I-NEXT:    srli a3, s0, 16
 ; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
+; RV32I-NEXT:    or a3, s0, a6
+; RV32I-NEXT:    sb s0, 16(a2)
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    sb s0, 17(a2)
+; RV32I-NEXT:    srli a6, t4, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    or a6, t4, s1
+; RV32I-NEXT:    sb t4, 20(a2)
+; RV32I-NEXT:    srli t3, t4, 8
 ; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
+; RV32I-NEXT:    srli t3, t0, 16
 ; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    or t3, t0, t6
+; RV32I-NEXT:    sb t0, 8(a2)
+; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    srli t0, t2, 16
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    sb t2, 12(a2)
+; RV32I-NEXT:    srli t1, t2, 8
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 6(a2)
 ; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
-; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 27(a2)
 ; RV32I-NEXT:    srli a3, a3, 24
 ; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 23(a2)
+; RV32I-NEXT:    srli a1, t3, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a1, t0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a1, a7, 24
 ; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
index 34900b30069159..95cb61ef050519 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
@@ -57,14 +57,14 @@ define i64 @lwud(i32* %a) {
 define i64 @ldd(i64* %a) {
 ; RV32XTHEADMEMPAIR-LABEL: ldd:
 ; RV32XTHEADMEMPAIR:       # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT:    lw a1, 32(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a2, 36(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a3, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a1, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a2, 32(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a3, 36(a0)
 ; RV32XTHEADMEMPAIR-NEXT:    lw a0, 40(a0)
-; RV32XTHEADMEMPAIR-NEXT:    add a2, a2, a3
-; RV32XTHEADMEMPAIR-NEXT:    add a0, a1, a0
-; RV32XTHEADMEMPAIR-NEXT:    sltu a1, a0, a1
-; RV32XTHEADMEMPAIR-NEXT:    add a1, a2, a1
+; RV32XTHEADMEMPAIR-NEXT:    add a1, a3, a1
+; RV32XTHEADMEMPAIR-NEXT:    add a0, a2, a0
+; RV32XTHEADMEMPAIR-NEXT:    sltu a2, a0, a2
+; RV32XTHEADMEMPAIR-NEXT:    add a1, a1, a2
 ; RV32XTHEADMEMPAIR-NEXT:    ret
 ;
 ; RV64XTHEADMEMPAIR-LABEL: ldd:

>From 044d1d83831b1a8e0b6ececb79dafebb72d79cac Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 13 Dec 2023 14:41:35 +0000
Subject: [PATCH 4/4] [RISCV] Enable load clustering in SelectionDAG scheduler

---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |  65 +++
 llvm/lib/Target/RISCV/RISCVInstrInfo.h        |   7 +
 llvm/test/CodeGen/RISCV/byval.ll              |   6 +-
 ...calling-conv-ilp32-ilp32f-ilp32d-common.ll | 100 ++---
 .../calling-conv-lp64-lp64f-lp64d-common.ll   |  50 +--
 llvm/test/CodeGen/RISCV/fpclamptosat.ll       | 320 +++++++--------
 llvm/test/CodeGen/RISCV/iabs.ll               | 144 +++----
 llvm/test/CodeGen/RISCV/legalize-fneg.ll      |  18 +-
 llvm/test/CodeGen/RISCV/mem.ll                |  30 +-
 llvm/test/CodeGen/RISCV/mem64.ll              |  39 +-
 llvm/test/CodeGen/RISCV/memcpy-inline.ll      | 288 ++++++-------
 llvm/test/CodeGen/RISCV/memcpy.ll             |  22 +-
 .../CodeGen/RISCV/misched-load-clustering.ll  |  10 +-
 llvm/test/CodeGen/RISCV/pr63816.ll            |  62 +--
 llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll |   8 +-
 .../CodeGen/RISCV/rv64-legal-i32/mem64.ll     |  39 +-
 .../RISCV/rvv/fixed-vectors-fp2i-sat.ll       |  92 ++---
 .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll   |  24 +-
 ...fixed-vectors-interleaved-access-zve32x.ll |   2 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-llrint.ll | 124 +++---
 .../CodeGen/RISCV/rvv/fixed-vectors-lrint.ll  |  40 +-
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |  18 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    | 384 +++++++++---------
 llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll  |  40 +-
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |   6 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    | 160 ++++----
 .../RISCV/umulo-128-legalisation-lowering.ll  |  94 ++---
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    | 138 +++----
 .../RISCV/wide-scalar-shift-legalization.ll   | 344 ++++++++--------
 llvm/test/CodeGen/RISCV/xtheadmempair.ll      |  10 +-
 30 files changed, 1388 insertions(+), 1296 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index b8960fd9571c9b..0aab5e9376c1d7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2194,6 +2194,71 @@ MachineInstr *RISCVInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
       .setMIFlags(MemI.getFlags());
 }
 
+bool RISCVInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                             int64_t &Offset1,
+                                             int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+
+  auto IsLoadOpcode = [&](unsigned Opcode) {
+    switch (Opcode) {
+    case RISCV::LB:
+    case RISCV::LBU:
+    case RISCV::LH:
+    case RISCV::LHU:
+    case RISCV::FLH:
+    case RISCV::LW:
+    case RISCV::LWU:
+    case RISCV::FLW:
+    case RISCV::LD:
+    case RISCV::FLD:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
+      !IsLoadOpcode(Load2->getMachineOpcode()))
+    return false;
+
+  // Check if base address and chain operands match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(2) != Load2->getOperand(2))
+    return false;
+
+  // Determine the offsets.
+  if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
+      isa<ConstantSDNode>(Load2->getOperand(1))) {
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
+    Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+bool RISCVInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                             int64_t Offset1, int64_t Offset2,
+                                             unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+
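+  // Don't cluster loads whose offsets are too far apart to plausibly share a
+  // cache line or pairing opportunity.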
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  // If the machine opcodes differ, conservatively assume the loads do not
+  // share the same base address.
+  if (Load1->getMachineOpcode() != Load2->getMachineOpcode())
+    return false; // FIXME: overly conservative?
+
+  // Four loads in a row should be sufficient.
+  if (NumLoads >= 3)
+    return false;
+
+  return true;
+}
+
 bool RISCVInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 7e1d3f31180650..45fca85f534a8b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -157,6 +157,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
       int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
       const TargetRegisterInfo *TRI) const override;
 
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
@@ -164,6 +167,10 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1,
+                               int64_t Offset2,
+                               unsigned NumLoads) const override;
+
   bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
                                     const MachineOperand *&BaseOp,
                                     int64_t &Offset, unsigned &Width,
diff --git a/llvm/test/CodeGen/RISCV/byval.ll b/llvm/test/CodeGen/RISCV/byval.ll
index d300542e080752..368acaa1ca2648 100644
--- a/llvm/test/CodeGen/RISCV/byval.ll
+++ b/llvm/test/CodeGen/RISCV/byval.ll
@@ -26,10 +26,10 @@ define void @caller() nounwind {
 ; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a0, a0, %lo(foo)
 ; RV32I-NEXT:    lw a1, 12(a0)
-; RV32I-NEXT:    sw a1, 24(sp)
-; RV32I-NEXT:    lw a1, 8(a0)
-; RV32I-NEXT:    sw a1, 20(sp)
+; RV32I-NEXT:    lw a2, 8(a0)
 ; RV32I-NEXT:    lw a0, 4(a0)
+; RV32I-NEXT:    sw a1, 24(sp)
+; RV32I-NEXT:    sw a2, 20(sp)
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    addi a0, sp, 12
 ; RV32I-NEXT:    call callee at plt
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 15ce89b5d35646..3407a99497fadd 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_scalars:
 ; RV32I-FPELIM:       # %bb.0:
-; RV32I-FPELIM-NEXT:    lw a2, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a4, 12(a1)
+; RV32I-FPELIM-NEXT:    lw a2, 0(a1)
+; RV32I-FPELIM-NEXT:    lw a3, 4(a1)
+; RV32I-FPELIM-NEXT:    lw a4, 8(a1)
+; RV32I-FPELIM-NEXT:    lw a1, 12(a1)
 ; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a6, 0(a1)
-; RV32I-FPELIM-NEXT:    lw a7, 4(a1)
-; RV32I-FPELIM-NEXT:    lw a1, 8(a1)
+; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a7, 0(a0)
 ; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    xor a4, a5, a4
-; RV32I-FPELIM-NEXT:    xor a3, a3, a7
-; RV32I-FPELIM-NEXT:    or a3, a3, a4
-; RV32I-FPELIM-NEXT:    xor a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a1, a2, a6
-; RV32I-FPELIM-NEXT:    or a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    xor a1, a5, a1
+; RV32I-FPELIM-NEXT:    xor a3, a6, a3
+; RV32I-FPELIM-NEXT:    or a1, a3, a1
+; RV32I-FPELIM-NEXT:    xor a0, a0, a4
+; RV32I-FPELIM-NEXT:    xor a2, a7, a2
+; RV32I-FPELIM-NEXT:    or a0, a2, a0
+; RV32I-FPELIM-NEXT:    or a0, a0, a1
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    lw a2, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a4, 12(a1)
+; RV32I-WITHFP-NEXT:    lw a2, 0(a1)
+; RV32I-WITHFP-NEXT:    lw a3, 4(a1)
+; RV32I-WITHFP-NEXT:    lw a4, 8(a1)
+; RV32I-WITHFP-NEXT:    lw a1, 12(a1)
 ; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a6, 0(a1)
-; RV32I-WITHFP-NEXT:    lw a7, 4(a1)
-; RV32I-WITHFP-NEXT:    lw a1, 8(a1)
+; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a7, 0(a0)
 ; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    xor a4, a5, a4
-; RV32I-WITHFP-NEXT:    xor a3, a3, a7
-; RV32I-WITHFP-NEXT:    or a3, a3, a4
-; RV32I-WITHFP-NEXT:    xor a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a1, a2, a6
-; RV32I-WITHFP-NEXT:    or a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    xor a1, a5, a1
+; RV32I-WITHFP-NEXT:    xor a3, a6, a3
+; RV32I-WITHFP-NEXT:    or a1, a3, a1
+; RV32I-WITHFP-NEXT:    xor a0, a0, a4
+; RV32I-WITHFP-NEXT:    xor a2, a7, a2
+; RV32I-WITHFP-NEXT:    or a0, a2, a0
+; RV32I-WITHFP-NEXT:    or a0, a0, a1
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -297,21 +297,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-LABEL: callee_large_scalars_exhausted_regs:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
-; RV32I-FPELIM-NEXT:    lw a1, 0(a7)
-; RV32I-FPELIM-NEXT:    lw a2, 4(a7)
-; RV32I-FPELIM-NEXT:    lw a3, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a1, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a2, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a3, 8(a0)
+; RV32I-FPELIM-NEXT:    lw a0, 12(a0)
 ; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
-; RV32I-FPELIM-NEXT:    lw a5, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
+; RV32I-FPELIM-NEXT:    lw a5, 4(a7)
+; RV32I-FPELIM-NEXT:    lw a6, 0(a7)
 ; RV32I-FPELIM-NEXT:    lw a7, 8(a7)
-; RV32I-FPELIM-NEXT:    xor a3, a4, a3
-; RV32I-FPELIM-NEXT:    xor a2, a2, a6
-; RV32I-FPELIM-NEXT:    or a2, a2, a3
-; RV32I-FPELIM-NEXT:    xor a0, a7, a0
-; RV32I-FPELIM-NEXT:    xor a1, a1, a5
+; RV32I-FPELIM-NEXT:    xor a0, a4, a0
+; RV32I-FPELIM-NEXT:    xor a2, a5, a2
+; RV32I-FPELIM-NEXT:    or a0, a2, a0
+; RV32I-FPELIM-NEXT:    xor a2, a7, a3
+; RV32I-FPELIM-NEXT:    xor a1, a6, a1
+; RV32I-FPELIM-NEXT:    or a1, a1, a2
 ; RV32I-FPELIM-NEXT:    or a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -322,21 +322,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
-; RV32I-WITHFP-NEXT:    lw a1, 0(a7)
-; RV32I-WITHFP-NEXT:    lw a2, 4(a7)
-; RV32I-WITHFP-NEXT:    lw a3, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a1, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a2, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a3, 8(a0)
+; RV32I-WITHFP-NEXT:    lw a0, 12(a0)
 ; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
-; RV32I-WITHFP-NEXT:    lw a5, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
+; RV32I-WITHFP-NEXT:    lw a5, 4(a7)
+; RV32I-WITHFP-NEXT:    lw a6, 0(a7)
 ; RV32I-WITHFP-NEXT:    lw a7, 8(a7)
-; RV32I-WITHFP-NEXT:    xor a3, a4, a3
-; RV32I-WITHFP-NEXT:    xor a2, a2, a6
-; RV32I-WITHFP-NEXT:    or a2, a2, a3
-; RV32I-WITHFP-NEXT:    xor a0, a7, a0
-; RV32I-WITHFP-NEXT:    xor a1, a1, a5
+; RV32I-WITHFP-NEXT:    xor a0, a4, a0
+; RV32I-WITHFP-NEXT:    xor a2, a5, a2
+; RV32I-WITHFP-NEXT:    or a0, a2, a0
+; RV32I-WITHFP-NEXT:    xor a2, a7, a3
+; RV32I-WITHFP-NEXT:    xor a1, a6, a1
+; RV32I-WITHFP-NEXT:    or a1, a1, a2
 ; RV32I-WITHFP-NEXT:    or a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index a174c3422dbb27..f99bb598f48343 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind {
 define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-LABEL: callee_large_scalars:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a2, 0(a0)
-; RV64I-NEXT:    ld a3, 8(a0)
-; RV64I-NEXT:    ld a4, 24(a1)
+; RV64I-NEXT:    ld a2, 0(a1)
+; RV64I-NEXT:    ld a3, 8(a1)
+; RV64I-NEXT:    ld a4, 16(a1)
+; RV64I-NEXT:    ld a1, 24(a1)
 ; RV64I-NEXT:    ld a5, 24(a0)
-; RV64I-NEXT:    ld a6, 0(a1)
-; RV64I-NEXT:    ld a7, 8(a1)
-; RV64I-NEXT:    ld a1, 16(a1)
+; RV64I-NEXT:    ld a6, 8(a0)
+; RV64I-NEXT:    ld a7, 0(a0)
 ; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    xor a4, a5, a4
-; RV64I-NEXT:    xor a3, a3, a7
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    xor a0, a0, a1
-; RV64I-NEXT:    xor a1, a2, a6
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    xor a1, a5, a1
+; RV64I-NEXT:    xor a3, a6, a3
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    xor a0, a0, a4
+; RV64I-NEXT:    xor a2, a7, a2
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -161,21 +161,21 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-LABEL: callee_large_scalars_exhausted_regs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    ld a0, 8(sp)
-; RV64I-NEXT:    ld a1, 0(a7)
-; RV64I-NEXT:    ld a2, 8(a7)
-; RV64I-NEXT:    ld a3, 24(a0)
+; RV64I-NEXT:    ld a1, 0(a0)
+; RV64I-NEXT:    ld a2, 8(a0)
+; RV64I-NEXT:    ld a3, 16(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
 ; RV64I-NEXT:    ld a4, 24(a7)
-; RV64I-NEXT:    ld a5, 0(a0)
-; RV64I-NEXT:    ld a6, 8(a0)
-; RV64I-NEXT:    ld a0, 16(a0)
+; RV64I-NEXT:    ld a5, 8(a7)
+; RV64I-NEXT:    ld a6, 0(a7)
 ; RV64I-NEXT:    ld a7, 16(a7)
-; RV64I-NEXT:    xor a3, a4, a3
-; RV64I-NEXT:    xor a2, a2, a6
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    xor a0, a7, a0
-; RV64I-NEXT:    xor a1, a1, a5
+; RV64I-NEXT:    xor a0, a4, a0
+; RV64I-NEXT:    xor a2, a5, a2
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    xor a2, a7, a3
+; RV64I-NEXT:    xor a1, a6, a1
+; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %h, %j
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 7695815cf50686..ae42cd6f8128b2 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -1043,21 +1043,21 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti at plt
-; RV32IF-NEXT:    lw a3, 8(sp)
+; RV32IF-NEXT:    lw a2, 8(sp)
 ; RV32IF-NEXT:    lw a1, 12(sp)
-; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a3, 16(sp)
 ; RV32IF-NEXT:    lw a4, 20(sp)
 ; RV32IF-NEXT:    lui a0, 524288
 ; RV32IF-NEXT:    addi a5, a0, -1
 ; RV32IF-NEXT:    beq a1, a5, .LBB18_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
 ; RV32IF-NEXT:    sltu a6, a1, a5
-; RV32IF-NEXT:    or a7, a2, a4
+; RV32IF-NEXT:    or a7, a3, a4
 ; RV32IF-NEXT:    bnez a7, .LBB18_3
 ; RV32IF-NEXT:    j .LBB18_4
 ; RV32IF-NEXT:  .LBB18_2:
-; RV32IF-NEXT:    sltiu a6, a3, -1
-; RV32IF-NEXT:    or a7, a2, a4
+; RV32IF-NEXT:    sltiu a6, a2, -1
+; RV32IF-NEXT:    or a7, a3, a4
 ; RV32IF-NEXT:    beqz a7, .LBB18_4
 ; RV32IF-NEXT:  .LBB18_3: # %entry
 ; RV32IF-NEXT:    slti a6, a4, 0
@@ -1068,19 +1068,19 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:  # %bb.5: # %entry
 ; RV32IF-NEXT:    mv a1, a5
 ; RV32IF-NEXT:  .LBB18_6: # %entry
-; RV32IF-NEXT:    or a3, t0, a3
+; RV32IF-NEXT:    or a2, t0, a2
 ; RV32IF-NEXT:    and a4, a7, a4
-; RV32IF-NEXT:    and a2, a7, a2
+; RV32IF-NEXT:    and a3, a7, a3
 ; RV32IF-NEXT:    beq a1, a0, .LBB18_8
 ; RV32IF-NEXT:  # %bb.7: # %entry
 ; RV32IF-NEXT:    sltu a0, a0, a1
 ; RV32IF-NEXT:    j .LBB18_9
 ; RV32IF-NEXT:  .LBB18_8:
-; RV32IF-NEXT:    snez a0, a3
+; RV32IF-NEXT:    snez a0, a2
 ; RV32IF-NEXT:  .LBB18_9: # %entry
-; RV32IF-NEXT:    and a2, a2, a4
+; RV32IF-NEXT:    and a3, a3, a4
 ; RV32IF-NEXT:    li a5, -1
-; RV32IF-NEXT:    beq a2, a5, .LBB18_11
+; RV32IF-NEXT:    beq a3, a5, .LBB18_11
 ; RV32IF-NEXT:  # %bb.10: # %entry
 ; RV32IF-NEXT:    slti a0, a4, 0
 ; RV32IF-NEXT:    xori a0, a0, 1
@@ -1090,7 +1090,7 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:    lui a1, 524288
 ; RV32IF-NEXT:  .LBB18_13: # %entry
 ; RV32IF-NEXT:    neg a0, a0
-; RV32IF-NEXT:    and a0, a0, a3
+; RV32IF-NEXT:    and a0, a0, a2
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -1142,21 +1142,21 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti at plt
-; RV32IFD-NEXT:    lw a3, 8(sp)
+; RV32IFD-NEXT:    lw a2, 8(sp)
 ; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a3, 16(sp)
 ; RV32IFD-NEXT:    lw a4, 20(sp)
 ; RV32IFD-NEXT:    lui a0, 524288
 ; RV32IFD-NEXT:    addi a5, a0, -1
 ; RV32IFD-NEXT:    beq a1, a5, .LBB18_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
 ; RV32IFD-NEXT:    sltu a6, a1, a5
-; RV32IFD-NEXT:    or a7, a2, a4
+; RV32IFD-NEXT:    or a7, a3, a4
 ; RV32IFD-NEXT:    bnez a7, .LBB18_3
 ; RV32IFD-NEXT:    j .LBB18_4
 ; RV32IFD-NEXT:  .LBB18_2:
-; RV32IFD-NEXT:    sltiu a6, a3, -1
-; RV32IFD-NEXT:    or a7, a2, a4
+; RV32IFD-NEXT:    sltiu a6, a2, -1
+; RV32IFD-NEXT:    or a7, a3, a4
 ; RV32IFD-NEXT:    beqz a7, .LBB18_4
 ; RV32IFD-NEXT:  .LBB18_3: # %entry
 ; RV32IFD-NEXT:    slti a6, a4, 0
@@ -1167,19 +1167,19 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:  # %bb.5: # %entry
 ; RV32IFD-NEXT:    mv a1, a5
 ; RV32IFD-NEXT:  .LBB18_6: # %entry
-; RV32IFD-NEXT:    or a3, t0, a3
+; RV32IFD-NEXT:    or a2, t0, a2
 ; RV32IFD-NEXT:    and a4, a7, a4
-; RV32IFD-NEXT:    and a2, a7, a2
+; RV32IFD-NEXT:    and a3, a7, a3
 ; RV32IFD-NEXT:    beq a1, a0, .LBB18_8
 ; RV32IFD-NEXT:  # %bb.7: # %entry
 ; RV32IFD-NEXT:    sltu a0, a0, a1
 ; RV32IFD-NEXT:    j .LBB18_9
 ; RV32IFD-NEXT:  .LBB18_8:
-; RV32IFD-NEXT:    snez a0, a3
+; RV32IFD-NEXT:    snez a0, a2
 ; RV32IFD-NEXT:  .LBB18_9: # %entry
-; RV32IFD-NEXT:    and a2, a2, a4
+; RV32IFD-NEXT:    and a3, a3, a4
 ; RV32IFD-NEXT:    li a5, -1
-; RV32IFD-NEXT:    beq a2, a5, .LBB18_11
+; RV32IFD-NEXT:    beq a3, a5, .LBB18_11
 ; RV32IFD-NEXT:  # %bb.10: # %entry
 ; RV32IFD-NEXT:    slti a0, a4, 0
 ; RV32IFD-NEXT:    xori a0, a0, 1
@@ -1189,7 +1189,7 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:    lui a1, 524288
 ; RV32IFD-NEXT:  .LBB18_13: # %entry
 ; RV32IFD-NEXT:    neg a0, a0
-; RV32IFD-NEXT:    and a0, a0, a3
+; RV32IFD-NEXT:    and a0, a0, a2
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -1225,18 +1225,18 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IF-NEXT:    call __fixunsdfti at plt
 ; RV32IF-NEXT:    lw a0, 16(sp)
 ; RV32IF-NEXT:    lw a1, 20(sp)
-; RV32IF-NEXT:    lw a2, 12(sp)
-; RV32IF-NEXT:    lw a3, 8(sp)
-; RV32IF-NEXT:    or a4, a1, a0
+; RV32IF-NEXT:    lw a2, 8(sp)
+; RV32IF-NEXT:    lw a3, 12(sp)
+; RV32IF-NEXT:    xori a4, a0, 1
+; RV32IF-NEXT:    or a4, a4, a1
 ; RV32IF-NEXT:    seqz a4, a4
-; RV32IF-NEXT:    xori a0, a0, 1
-; RV32IF-NEXT:    or a0, a0, a1
+; RV32IF-NEXT:    addi a4, a4, -1
+; RV32IF-NEXT:    or a0, a1, a0
 ; RV32IF-NEXT:    seqz a0, a0
-; RV32IF-NEXT:    addi a0, a0, -1
-; RV32IF-NEXT:    and a0, a0, a4
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    neg a1, a0
-; RV32IF-NEXT:    and a0, a1, a3
-; RV32IF-NEXT:    and a1, a1, a2
+; RV32IF-NEXT:    and a0, a1, a2
+; RV32IF-NEXT:    and a1, a1, a3
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -1265,18 +1265,18 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IFD-NEXT:    call __fixunsdfti at plt
 ; RV32IFD-NEXT:    lw a0, 16(sp)
 ; RV32IFD-NEXT:    lw a1, 20(sp)
-; RV32IFD-NEXT:    lw a2, 12(sp)
-; RV32IFD-NEXT:    lw a3, 8(sp)
-; RV32IFD-NEXT:    or a4, a1, a0
+; RV32IFD-NEXT:    lw a2, 8(sp)
+; RV32IFD-NEXT:    lw a3, 12(sp)
+; RV32IFD-NEXT:    xori a4, a0, 1
+; RV32IFD-NEXT:    or a4, a4, a1
 ; RV32IFD-NEXT:    seqz a4, a4
-; RV32IFD-NEXT:    xori a0, a0, 1
-; RV32IFD-NEXT:    or a0, a0, a1
+; RV32IFD-NEXT:    addi a4, a4, -1
+; RV32IFD-NEXT:    or a0, a1, a0
 ; RV32IFD-NEXT:    seqz a0, a0
-; RV32IFD-NEXT:    addi a0, a0, -1
-; RV32IFD-NEXT:    and a0, a0, a4
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    neg a1, a0
-; RV32IFD-NEXT:    and a0, a1, a3
-; RV32IFD-NEXT:    and a1, a1, a2
+; RV32IFD-NEXT:    and a0, a1, a2
+; RV32IFD-NEXT:    and a1, a1, a3
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -1440,21 +1440,21 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti at plt
-; RV32-NEXT:    lw a3, 8(sp)
+; RV32-NEXT:    lw a2, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 16(sp)
 ; RV32-NEXT:    lw a4, 20(sp)
 ; RV32-NEXT:    lui a0, 524288
 ; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB21_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    bnez a7, .LBB21_3
 ; RV32-NEXT:    j .LBB21_4
 ; RV32-NEXT:  .LBB21_2:
-; RV32-NEXT:    sltiu a6, a3, -1
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    sltiu a6, a2, -1
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    beqz a7, .LBB21_4
 ; RV32-NEXT:  .LBB21_3: # %entry
 ; RV32-NEXT:    slti a6, a4, 0
@@ -1465,19 +1465,19 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB21_6: # %entry
-; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    or a2, t0, a2
 ; RV32-NEXT:    and a4, a7, a4
-; RV32-NEXT:    and a2, a7, a2
+; RV32-NEXT:    and a3, a7, a3
 ; RV32-NEXT:    beq a1, a0, .LBB21_8
 ; RV32-NEXT:  # %bb.7: # %entry
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB21_9
 ; RV32-NEXT:  .LBB21_8:
-; RV32-NEXT:    snez a0, a3
+; RV32-NEXT:    snez a0, a2
 ; RV32-NEXT:  .LBB21_9: # %entry
-; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    li a5, -1
-; RV32-NEXT:    beq a2, a5, .LBB21_11
+; RV32-NEXT:    beq a3, a5, .LBB21_11
 ; RV32-NEXT:  # %bb.10: # %entry
 ; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
@@ -1487,7 +1487,7 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB21_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -1521,18 +1521,18 @@ define i64 @utest_f32i64(float %x) {
 ; RV32-NEXT:    call __fixunssfti at plt
 ; RV32-NEXT:    lw a0, 16(sp)
 ; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
+; RV32-NEXT:    lw a2, 8(sp)
+; RV32-NEXT:    lw a3, 12(sp)
+; RV32-NEXT:    xori a4, a0, 1
+; RV32-NEXT:    or a4, a4, a1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    xori a0, a0, 1
-; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    addi a4, a4, -1
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a4, a0
 ; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a0, a1, a2
+; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -1657,21 +1657,21 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:    call __extendhfsf2 at plt
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti at plt
-; RV32-NEXT:    lw a3, 8(sp)
+; RV32-NEXT:    lw a2, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 16(sp)
 ; RV32-NEXT:    lw a4, 20(sp)
 ; RV32-NEXT:    lui a0, 524288
 ; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB24_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    bnez a7, .LBB24_3
 ; RV32-NEXT:    j .LBB24_4
 ; RV32-NEXT:  .LBB24_2:
-; RV32-NEXT:    sltiu a6, a3, -1
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    sltiu a6, a2, -1
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    beqz a7, .LBB24_4
 ; RV32-NEXT:  .LBB24_3: # %entry
 ; RV32-NEXT:    slti a6, a4, 0
@@ -1682,19 +1682,19 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB24_6: # %entry
-; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    or a2, t0, a2
 ; RV32-NEXT:    and a4, a7, a4
-; RV32-NEXT:    and a2, a7, a2
+; RV32-NEXT:    and a3, a7, a3
 ; RV32-NEXT:    beq a1, a0, .LBB24_8
 ; RV32-NEXT:  # %bb.7: # %entry
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB24_9
 ; RV32-NEXT:  .LBB24_8:
-; RV32-NEXT:    snez a0, a3
+; RV32-NEXT:    snez a0, a2
 ; RV32-NEXT:  .LBB24_9: # %entry
-; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    li a5, -1
-; RV32-NEXT:    beq a2, a5, .LBB24_11
+; RV32-NEXT:    beq a3, a5, .LBB24_11
 ; RV32-NEXT:  # %bb.10: # %entry
 ; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
@@ -1704,7 +1704,7 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB24_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -1770,18 +1770,18 @@ define i64 @utesth_f16i64(half %x) {
 ; RV32-NEXT:    call __fixunssfti at plt
 ; RV32-NEXT:    lw a0, 16(sp)
 ; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
+; RV32-NEXT:    lw a2, 8(sp)
+; RV32-NEXT:    lw a3, 12(sp)
+; RV32-NEXT:    xori a4, a0, 1
+; RV32-NEXT:    or a4, a4, a1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    xori a0, a0, 1
-; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    addi a4, a4, -1
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a4, a0
 ; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a0, a1, a2
+; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -2892,21 +2892,21 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti at plt
-; RV32IF-NEXT:    lw a3, 8(sp)
+; RV32IF-NEXT:    lw a2, 8(sp)
 ; RV32IF-NEXT:    lw a1, 12(sp)
-; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a3, 16(sp)
 ; RV32IF-NEXT:    lw a4, 20(sp)
 ; RV32IF-NEXT:    lui a0, 524288
 ; RV32IF-NEXT:    addi a5, a0, -1
 ; RV32IF-NEXT:    beq a1, a5, .LBB45_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
 ; RV32IF-NEXT:    sltu a6, a1, a5
-; RV32IF-NEXT:    or a7, a2, a4
+; RV32IF-NEXT:    or a7, a3, a4
 ; RV32IF-NEXT:    bnez a7, .LBB45_3
 ; RV32IF-NEXT:    j .LBB45_4
 ; RV32IF-NEXT:  .LBB45_2:
-; RV32IF-NEXT:    sltiu a6, a3, -1
-; RV32IF-NEXT:    or a7, a2, a4
+; RV32IF-NEXT:    sltiu a6, a2, -1
+; RV32IF-NEXT:    or a7, a3, a4
 ; RV32IF-NEXT:    beqz a7, .LBB45_4
 ; RV32IF-NEXT:  .LBB45_3: # %entry
 ; RV32IF-NEXT:    slti a6, a4, 0
@@ -2917,19 +2917,19 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:  # %bb.5: # %entry
 ; RV32IF-NEXT:    mv a1, a5
 ; RV32IF-NEXT:  .LBB45_6: # %entry
-; RV32IF-NEXT:    or a3, t0, a3
+; RV32IF-NEXT:    or a2, t0, a2
 ; RV32IF-NEXT:    and a4, a7, a4
-; RV32IF-NEXT:    and a2, a7, a2
+; RV32IF-NEXT:    and a3, a7, a3
 ; RV32IF-NEXT:    beq a1, a0, .LBB45_8
 ; RV32IF-NEXT:  # %bb.7: # %entry
 ; RV32IF-NEXT:    sltu a0, a0, a1
 ; RV32IF-NEXT:    j .LBB45_9
 ; RV32IF-NEXT:  .LBB45_8:
-; RV32IF-NEXT:    snez a0, a3
+; RV32IF-NEXT:    snez a0, a2
 ; RV32IF-NEXT:  .LBB45_9: # %entry
-; RV32IF-NEXT:    and a2, a2, a4
+; RV32IF-NEXT:    and a3, a3, a4
 ; RV32IF-NEXT:    li a5, -1
-; RV32IF-NEXT:    beq a2, a5, .LBB45_11
+; RV32IF-NEXT:    beq a3, a5, .LBB45_11
 ; RV32IF-NEXT:  # %bb.10: # %entry
 ; RV32IF-NEXT:    slti a0, a4, 0
 ; RV32IF-NEXT:    xori a0, a0, 1
@@ -2939,7 +2939,7 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    lui a1, 524288
 ; RV32IF-NEXT:  .LBB45_13: # %entry
 ; RV32IF-NEXT:    neg a0, a0
-; RV32IF-NEXT:    and a0, a0, a3
+; RV32IF-NEXT:    and a0, a0, a2
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -2991,21 +2991,21 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti at plt
-; RV32IFD-NEXT:    lw a3, 8(sp)
+; RV32IFD-NEXT:    lw a2, 8(sp)
 ; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a3, 16(sp)
 ; RV32IFD-NEXT:    lw a4, 20(sp)
 ; RV32IFD-NEXT:    lui a0, 524288
 ; RV32IFD-NEXT:    addi a5, a0, -1
 ; RV32IFD-NEXT:    beq a1, a5, .LBB45_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
 ; RV32IFD-NEXT:    sltu a6, a1, a5
-; RV32IFD-NEXT:    or a7, a2, a4
+; RV32IFD-NEXT:    or a7, a3, a4
 ; RV32IFD-NEXT:    bnez a7, .LBB45_3
 ; RV32IFD-NEXT:    j .LBB45_4
 ; RV32IFD-NEXT:  .LBB45_2:
-; RV32IFD-NEXT:    sltiu a6, a3, -1
-; RV32IFD-NEXT:    or a7, a2, a4
+; RV32IFD-NEXT:    sltiu a6, a2, -1
+; RV32IFD-NEXT:    or a7, a3, a4
 ; RV32IFD-NEXT:    beqz a7, .LBB45_4
 ; RV32IFD-NEXT:  .LBB45_3: # %entry
 ; RV32IFD-NEXT:    slti a6, a4, 0
@@ -3016,19 +3016,19 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:  # %bb.5: # %entry
 ; RV32IFD-NEXT:    mv a1, a5
 ; RV32IFD-NEXT:  .LBB45_6: # %entry
-; RV32IFD-NEXT:    or a3, t0, a3
+; RV32IFD-NEXT:    or a2, t0, a2
 ; RV32IFD-NEXT:    and a4, a7, a4
-; RV32IFD-NEXT:    and a2, a7, a2
+; RV32IFD-NEXT:    and a3, a7, a3
 ; RV32IFD-NEXT:    beq a1, a0, .LBB45_8
 ; RV32IFD-NEXT:  # %bb.7: # %entry
 ; RV32IFD-NEXT:    sltu a0, a0, a1
 ; RV32IFD-NEXT:    j .LBB45_9
 ; RV32IFD-NEXT:  .LBB45_8:
-; RV32IFD-NEXT:    snez a0, a3
+; RV32IFD-NEXT:    snez a0, a2
 ; RV32IFD-NEXT:  .LBB45_9: # %entry
-; RV32IFD-NEXT:    and a2, a2, a4
+; RV32IFD-NEXT:    and a3, a3, a4
 ; RV32IFD-NEXT:    li a5, -1
-; RV32IFD-NEXT:    beq a2, a5, .LBB45_11
+; RV32IFD-NEXT:    beq a3, a5, .LBB45_11
 ; RV32IFD-NEXT:  # %bb.10: # %entry
 ; RV32IFD-NEXT:    slti a0, a4, 0
 ; RV32IFD-NEXT:    xori a0, a0, 1
@@ -3038,7 +3038,7 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    lui a1, 524288
 ; RV32IFD-NEXT:  .LBB45_13: # %entry
 ; RV32IFD-NEXT:    neg a0, a0
-; RV32IFD-NEXT:    and a0, a0, a3
+; RV32IFD-NEXT:    and a0, a0, a2
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -3072,18 +3072,18 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    call __fixunsdfti at plt
 ; RV32IF-NEXT:    lw a0, 16(sp)
 ; RV32IF-NEXT:    lw a1, 20(sp)
-; RV32IF-NEXT:    lw a2, 12(sp)
-; RV32IF-NEXT:    lw a3, 8(sp)
-; RV32IF-NEXT:    or a4, a1, a0
+; RV32IF-NEXT:    lw a2, 8(sp)
+; RV32IF-NEXT:    lw a3, 12(sp)
+; RV32IF-NEXT:    xori a4, a0, 1
+; RV32IF-NEXT:    or a4, a4, a1
 ; RV32IF-NEXT:    seqz a4, a4
-; RV32IF-NEXT:    xori a0, a0, 1
-; RV32IF-NEXT:    or a0, a0, a1
+; RV32IF-NEXT:    addi a4, a4, -1
+; RV32IF-NEXT:    or a0, a1, a0
 ; RV32IF-NEXT:    seqz a0, a0
-; RV32IF-NEXT:    addi a0, a0, -1
-; RV32IF-NEXT:    and a0, a0, a4
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    neg a1, a0
-; RV32IF-NEXT:    and a0, a1, a3
-; RV32IF-NEXT:    and a1, a1, a2
+; RV32IF-NEXT:    and a0, a1, a2
+; RV32IF-NEXT:    and a1, a1, a3
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -3112,18 +3112,18 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    call __fixunsdfti at plt
 ; RV32IFD-NEXT:    lw a0, 16(sp)
 ; RV32IFD-NEXT:    lw a1, 20(sp)
-; RV32IFD-NEXT:    lw a2, 12(sp)
-; RV32IFD-NEXT:    lw a3, 8(sp)
-; RV32IFD-NEXT:    or a4, a1, a0
+; RV32IFD-NEXT:    lw a2, 8(sp)
+; RV32IFD-NEXT:    lw a3, 12(sp)
+; RV32IFD-NEXT:    xori a4, a0, 1
+; RV32IFD-NEXT:    or a4, a4, a1
 ; RV32IFD-NEXT:    seqz a4, a4
-; RV32IFD-NEXT:    xori a0, a0, 1
-; RV32IFD-NEXT:    or a0, a0, a1
+; RV32IFD-NEXT:    addi a4, a4, -1
+; RV32IFD-NEXT:    or a0, a1, a0
 ; RV32IFD-NEXT:    seqz a0, a0
-; RV32IFD-NEXT:    addi a0, a0, -1
-; RV32IFD-NEXT:    and a0, a0, a4
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    neg a1, a0
-; RV32IFD-NEXT:    and a0, a1, a3
-; RV32IFD-NEXT:    and a1, a1, a2
+; RV32IFD-NEXT:    and a0, a1, a2
+; RV32IFD-NEXT:    and a1, a1, a3
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -3247,21 +3247,21 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti at plt
-; RV32-NEXT:    lw a3, 8(sp)
+; RV32-NEXT:    lw a2, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 16(sp)
 ; RV32-NEXT:    lw a4, 20(sp)
 ; RV32-NEXT:    lui a0, 524288
 ; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB48_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    bnez a7, .LBB48_3
 ; RV32-NEXT:    j .LBB48_4
 ; RV32-NEXT:  .LBB48_2:
-; RV32-NEXT:    sltiu a6, a3, -1
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    sltiu a6, a2, -1
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    beqz a7, .LBB48_4
 ; RV32-NEXT:  .LBB48_3: # %entry
 ; RV32-NEXT:    slti a6, a4, 0
@@ -3272,19 +3272,19 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB48_6: # %entry
-; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    or a2, t0, a2
 ; RV32-NEXT:    and a4, a7, a4
-; RV32-NEXT:    and a2, a7, a2
+; RV32-NEXT:    and a3, a7, a3
 ; RV32-NEXT:    beq a1, a0, .LBB48_8
 ; RV32-NEXT:  # %bb.7: # %entry
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB48_9
 ; RV32-NEXT:  .LBB48_8:
-; RV32-NEXT:    snez a0, a3
+; RV32-NEXT:    snez a0, a2
 ; RV32-NEXT:  .LBB48_9: # %entry
-; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    li a5, -1
-; RV32-NEXT:    beq a2, a5, .LBB48_11
+; RV32-NEXT:    beq a3, a5, .LBB48_11
 ; RV32-NEXT:  # %bb.10: # %entry
 ; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
@@ -3294,7 +3294,7 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB48_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3326,18 +3326,18 @@ define i64 @utest_f32i64_mm(float %x) {
 ; RV32-NEXT:    call __fixunssfti@plt
 ; RV32-NEXT:    lw a0, 16(sp)
 ; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
+; RV32-NEXT:    lw a2, 8(sp)
+; RV32-NEXT:    lw a3, 12(sp)
+; RV32-NEXT:    xori a4, a0, 1
+; RV32-NEXT:    or a4, a4, a1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    xori a0, a0, 1
-; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    addi a4, a4, -1
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a4, a0
 ; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a0, a1, a2
+; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3438,21 +3438,21 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2@plt
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti@plt
-; RV32-NEXT:    lw a3, 8(sp)
+; RV32-NEXT:    lw a2, 8(sp)
 ; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 16(sp)
 ; RV32-NEXT:    lw a4, 20(sp)
 ; RV32-NEXT:    lui a0, 524288
 ; RV32-NEXT:    addi a5, a0, -1
 ; RV32-NEXT:    beq a1, a5, .LBB51_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a6, a1, a5
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    bnez a7, .LBB51_3
 ; RV32-NEXT:    j .LBB51_4
 ; RV32-NEXT:  .LBB51_2:
-; RV32-NEXT:    sltiu a6, a3, -1
-; RV32-NEXT:    or a7, a2, a4
+; RV32-NEXT:    sltiu a6, a2, -1
+; RV32-NEXT:    or a7, a3, a4
 ; RV32-NEXT:    beqz a7, .LBB51_4
 ; RV32-NEXT:  .LBB51_3: # %entry
 ; RV32-NEXT:    slti a6, a4, 0
@@ -3463,19 +3463,19 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:  # %bb.5: # %entry
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB51_6: # %entry
-; RV32-NEXT:    or a3, t0, a3
+; RV32-NEXT:    or a2, t0, a2
 ; RV32-NEXT:    and a4, a7, a4
-; RV32-NEXT:    and a2, a7, a2
+; RV32-NEXT:    and a3, a7, a3
 ; RV32-NEXT:    beq a1, a0, .LBB51_8
 ; RV32-NEXT:  # %bb.7: # %entry
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    j .LBB51_9
 ; RV32-NEXT:  .LBB51_8:
-; RV32-NEXT:    snez a0, a3
+; RV32-NEXT:    snez a0, a2
 ; RV32-NEXT:  .LBB51_9: # %entry
-; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    li a5, -1
-; RV32-NEXT:    beq a2, a5, .LBB51_11
+; RV32-NEXT:    beq a3, a5, .LBB51_11
 ; RV32-NEXT:  # %bb.10: # %entry
 ; RV32-NEXT:    slti a0, a4, 0
 ; RV32-NEXT:    xori a0, a0, 1
@@ -3485,7 +3485,7 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:  .LBB51_13: # %entry
 ; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3549,18 +3549,18 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __fixunssfti@plt
 ; RV32-NEXT:    lw a0, 16(sp)
 ; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
+; RV32-NEXT:    lw a2, 8(sp)
+; RV32-NEXT:    lw a3, 12(sp)
+; RV32-NEXT:    xori a4, a0, 1
+; RV32-NEXT:    or a4, a4, a1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    xori a0, a0, 1
-; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    addi a4, a4, -1
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a0, a4, a0
 ; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a0, a1, a2
+; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 6e276b7a7a597f..81f6890a6b1410 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) {
 define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB8_2
+; RV32I-NEXT:    bgez a4, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    or a6, a4, a3
-; RV32I-NEXT:    snez a6, a6
-; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    snez a5, a1
+; RV32I-NEXT:    add a4, a4, a5
 ; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a7
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    snez a5, a4
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    sub a3, a3, a5
+; RV32I-NEXT:    or a5, a3, a2
+; RV32I-NEXT:    snez a5, a5
+; RV32I-NEXT:    sltu a6, a1, a5
 ; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    sub a4, a4, a6
+; RV32I-NEXT:    snez a6, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a2, a2, a6
+; RV32I-NEXT:    sub a1, a1, a5
+; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a3, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 12(a1)
+; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a2, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB8_2
+; RV32ZBB-NEXT:    bgez a4, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    or a6, a4, a3
-; RV32ZBB-NEXT:    snez a6, a6
-; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    snez a5, a1
+; RV32ZBB-NEXT:    add a4, a4, a5
 ; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a7
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    snez a5, a4
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    sub a3, a3, a5
+; RV32ZBB-NEXT:    or a5, a3, a2
+; RV32ZBB-NEXT:    snez a5, a5
+; RV32ZBB-NEXT:    sltu a6, a1, a5
 ; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    sub a4, a4, a6
+; RV32ZBB-NEXT:    snez a6, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a2, a2, a6
+; RV32ZBB-NEXT:    sub a1, a1, a5
+; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:  .LBB8_2:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a3, 0(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: abs128:
@@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) {
 define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB9_2
+; RV32I-NEXT:    bgez a4, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    or a6, a4, a3
-; RV32I-NEXT:    snez a6, a6
-; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    snez a5, a1
+; RV32I-NEXT:    add a4, a4, a5
 ; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a7
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    snez a5, a4
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    sub a3, a3, a5
+; RV32I-NEXT:    or a5, a3, a2
+; RV32I-NEXT:    snez a5, a5
+; RV32I-NEXT:    sltu a6, a1, a5
 ; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    sub a4, a4, a6
+; RV32I-NEXT:    snez a6, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a2, a2, a6
+; RV32I-NEXT:    sub a1, a1, a5
+; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a3, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 12(a1)
+; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a2, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB9_2
+; RV32ZBB-NEXT:    bgez a4, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    or a6, a4, a3
-; RV32ZBB-NEXT:    snez a6, a6
-; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    snez a5, a1
+; RV32ZBB-NEXT:    add a4, a4, a5
 ; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a7
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    snez a5, a4
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    sub a3, a3, a5
+; RV32ZBB-NEXT:    or a5, a3, a2
+; RV32ZBB-NEXT:    snez a5, a5
+; RV32ZBB-NEXT:    sltu a6, a1, a5
 ; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    sub a4, a4, a6
+; RV32ZBB-NEXT:    snez a6, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a2, a2, a6
+; RV32ZBB-NEXT:    sub a1, a1, a5
+; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:  .LBB9_2:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a3, 0(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: select_abs128:
diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
index dfd62e8d5f9f56..591abbc767e2d7 100644
--- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll
+++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
@@ -56,16 +56,16 @@ entry:
 define void @test3(ptr %a, ptr %b) nounwind {
 ; RV32-LABEL: test3:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 12(a1)
-; RV32-NEXT:    lw a3, 4(a1)
-; RV32-NEXT:    lw a4, 8(a1)
-; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a2, 8(a1)
+; RV32-NEXT:    lw a3, 12(a1)
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
 ; RV32-NEXT:    lui a5, 524288
-; RV32-NEXT:    xor a2, a2, a5
-; RV32-NEXT:    sw a4, 8(a0)
-; RV32-NEXT:    sw a1, 0(a0)
-; RV32-NEXT:    sw a3, 4(a0)
-; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    xor a3, a3, a5
+; RV32-NEXT:    sw a2, 8(a0)
+; RV32-NEXT:    sw a4, 0(a0)
+; RV32-NEXT:    sw a1, 4(a0)
+; RV32-NEXT:    sw a3, 12(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test3:
diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll
index 3718ce80142d49..8ffc302a5c13d7 100644
--- a/llvm/test/CodeGen/RISCV/mem.ll
+++ b/llvm/test/CodeGen/RISCV/mem.ll
@@ -22,9 +22,8 @@ define dso_local i32 @lb(ptr %a) nounwind {
 define dso_local i32 @lh(ptr %a) nounwind {
 ; RV32I-LABEL: lh:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lh a1, 4(a0)
 ; RV32I-NEXT:    lh zero, 0(a0)
-; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    lh a0, 4(a0)
 ; RV32I-NEXT:    ret
   %1 = getelementptr i16, ptr %a, i32 2
   %2 = load i16, ptr %1
@@ -37,9 +36,8 @@ define dso_local i32 @lh(ptr %a) nounwind {
 define dso_local i32 @lw(ptr %a) nounwind {
 ; RV32I-LABEL: lw:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a1, 12(a0)
 ; RV32I-NEXT:    lw zero, 0(a0)
-; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    lw a0, 12(a0)
 ; RV32I-NEXT:    ret
   %1 = getelementptr i32, ptr %a, i32 3
   %2 = load i32, ptr %1
@@ -50,9 +48,9 @@ define dso_local i32 @lw(ptr %a) nounwind {
 define dso_local i32 @lbu(ptr %a) nounwind {
 ; RV32I-LABEL: lbu:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a1, 4(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a0, 4(a0)
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
   %1 = getelementptr i8, ptr %a, i32 4
   %2 = load i8, ptr %1
@@ -66,9 +64,9 @@ define dso_local i32 @lbu(ptr %a) nounwind {
 define dso_local i32 @lhu(ptr %a) nounwind {
 ; RV32I-LABEL: lhu:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lhu a1, 10(a0)
-; RV32I-NEXT:    lhu a0, 0(a0)
-; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 10(a0)
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
   %1 = getelementptr i16, ptr %a, i32 5
   %2 = load i16, ptr %1
@@ -121,10 +119,10 @@ define dso_local void @sw(ptr %a, i32 %b) nounwind {
 define dso_local i32 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 ; RV32I-LABEL: load_sext_zext_anyext_i1:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    lbu a2, 2(a0)
 ; RV32I-NEXT:    lbu zero, 0(a0)
-; RV32I-NEXT:    sub a0, a2, a1
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    lbu a0, 2(a0)
+; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
@@ -143,10 +141,10 @@ define dso_local i32 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind {
 ; RV32I-LABEL: load_sext_zext_anyext_i1_i16:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    lbu a2, 2(a0)
 ; RV32I-NEXT:    lbu zero, 0(a0)
-; RV32I-NEXT:    sub a0, a2, a1
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    lbu a0, 2(a0)
+; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll
index 09b04535498c5e..befef07476d040 100644
--- a/llvm/test/CodeGen/RISCV/mem64.ll
+++ b/llvm/test/CodeGen/RISCV/mem64.ll
@@ -22,9 +22,8 @@ define dso_local i64 @lb(ptr %a) nounwind {
 define dso_local i64 @lh(ptr %a) nounwind {
 ; RV64I-LABEL: lh:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lh a1, 4(a0)
 ; RV64I-NEXT:    lh zero, 0(a0)
-; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    lh a0, 4(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i16, ptr %a, i32 2
   %2 = load i16, ptr %1
@@ -37,9 +36,8 @@ define dso_local i64 @lh(ptr %a) nounwind {
 define dso_local i64 @lw(ptr %a) nounwind {
 ; RV64I-LABEL: lw:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lw a1, 12(a0)
 ; RV64I-NEXT:    lw zero, 0(a0)
-; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    lw a0, 12(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i32, ptr %a, i32 3
   %2 = load i32, ptr %1
@@ -52,9 +50,9 @@ define dso_local i64 @lw(ptr %a) nounwind {
 define dso_local i64 @lbu(ptr %a) nounwind {
 ; RV64I-LABEL: lbu:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 4(a0)
-; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = getelementptr i8, ptr %a, i32 4
   %2 = load i8, ptr %1
@@ -68,9 +66,9 @@ define dso_local i64 @lbu(ptr %a) nounwind {
 define dso_local i64 @lhu(ptr %a) nounwind {
 ; RV64I-LABEL: lhu:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lhu a1, 10(a0)
-; RV64I-NEXT:    lhu a0, 0(a0)
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 10(a0)
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = getelementptr i16, ptr %a, i32 5
   %2 = load i16, ptr %1
@@ -84,9 +82,9 @@ define dso_local i64 @lhu(ptr %a) nounwind {
 define dso_local i64 @lwu(ptr %a) nounwind {
 ; RV64I-LABEL: lwu:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lwu a1, 24(a0)
-; RV64I-NEXT:    lwu a0, 0(a0)
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lwu a1, 0(a0)
+; RV64I-NEXT:    lwu a0, 24(a0)
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = getelementptr i32, ptr %a, i32 6
   %2 = load i32, ptr %1
@@ -140,9 +138,8 @@ define dso_local void @sw(ptr %a, i32 %b) nounwind {
 define dso_local i64 @ld(ptr %a) nounwind {
 ; RV64I-LABEL: ld:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a1, 80(a0)
 ; RV64I-NEXT:    ld zero, 0(a0)
-; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    ld a0, 80(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i64, ptr %a, i32 10
   %2 = load i64, ptr %1
@@ -166,10 +163,10 @@ define dso_local void @sd(ptr %a, i64 %b) nounwind {
 define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 ; RV64I-LABEL: load_sext_zext_anyext_i1:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 2(a0)
 ; RV64I-NEXT:    lbu zero, 0(a0)
-; RV64I-NEXT:    sub a0, a2, a1
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    lbu a0, 2(a0)
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
@@ -188,10 +185,10 @@ define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind {
 ; RV64I-LABEL: load_sext_zext_anyext_i1_i16:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 2(a0)
 ; RV64I-NEXT:    lbu zero, 0(a0)
-; RV64I-NEXT:    sub a0, a2, a1
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    lbu a0, 2(a0)
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
diff --git a/llvm/test/CodeGen/RISCV/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/memcpy-inline.ll
index 343695ee37da84..266afe00037280 100644
--- a/llvm/test/CodeGen/RISCV/memcpy-inline.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy-inline.ll
@@ -45,16 +45,16 @@ define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-LABEL: unaligned_memcpy2:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: unaligned_memcpy2:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
@@ -78,20 +78,20 @@ define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-LABEL: unaligned_memcpy3:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    lbu a2, 2(a1)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    lbu a3, 1(a1)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    sb a3, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: unaligned_memcpy3:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lbu a2, 2(a1)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    lbu a3, 1(a1)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    sb a3, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
@@ -119,24 +119,24 @@ define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-LABEL: unaligned_memcpy4:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    lbu a2, 3(a1)
-; RV32-NEXT:    sb a2, 3(a0)
-; RV32-NEXT:    lbu a2, 2(a1)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    lbu a3, 2(a1)
+; RV32-NEXT:    lbu a4, 1(a1)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    sb a3, 2(a0)
+; RV32-NEXT:    sb a4, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: unaligned_memcpy4:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lbu a2, 3(a1)
-; RV64-NEXT:    sb a2, 3(a0)
-; RV64-NEXT:    lbu a2, 2(a1)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    lbu a3, 2(a1)
+; RV64-NEXT:    lbu a4, 1(a1)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    sb a3, 2(a0)
+; RV64-NEXT:    sb a4, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
@@ -166,12 +166,12 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    lbu a2, 4(a1)
 ; RV32-NEXT:    sb a2, 4(a0)
 ; RV32-NEXT:    lbu a2, 3(a1)
-; RV32-NEXT:    sb a2, 3(a0)
-; RV32-NEXT:    lbu a2, 2(a1)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    lbu a3, 2(a1)
+; RV32-NEXT:    lbu a4, 1(a1)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    sb a3, 2(a0)
+; RV32-NEXT:    sb a4, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
@@ -184,28 +184,28 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    lbu a2, 4(a1)
 ; RV64-NEXT:    sb a2, 4(a0)
 ; RV64-NEXT:    lbu a2, 3(a1)
-; RV64-NEXT:    sb a2, 3(a0)
-; RV64-NEXT:    lbu a2, 2(a1)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    lbu a3, 2(a1)
+; RV64-NEXT:    lbu a4, 1(a1)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    sb a3, 2(a0)
+; RV64-NEXT:    sb a4, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: unaligned_memcpy7:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 3(a1)
-; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy7:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    lw a2, 3(a1)
-; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    lw a1, 0(a1)
+; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    sw a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -225,12 +225,12 @@ define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    lbu a2, 4(a1)
 ; RV32-NEXT:    sb a2, 4(a0)
 ; RV32-NEXT:    lbu a2, 3(a1)
-; RV32-NEXT:    sb a2, 3(a0)
-; RV32-NEXT:    lbu a2, 2(a1)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    lbu a3, 2(a1)
+; RV32-NEXT:    lbu a4, 1(a1)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    sb a3, 2(a0)
+; RV32-NEXT:    sb a4, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
@@ -245,20 +245,20 @@ define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    lbu a2, 4(a1)
 ; RV64-NEXT:    sb a2, 4(a0)
 ; RV64-NEXT:    lbu a2, 3(a1)
-; RV64-NEXT:    sb a2, 3(a0)
-; RV64-NEXT:    lbu a2, 2(a1)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    lbu a3, 2(a1)
+; RV64-NEXT:    lbu a4, 1(a1)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    sb a3, 2(a0)
+; RV64-NEXT:    sb a4, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: unaligned_memcpy8:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 4(a1)
-; RV32-FAST-NEXT:    sw a2, 4(a0)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 4(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
@@ -298,12 +298,12 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    lbu a2, 4(a1)
 ; RV32-NEXT:    sb a2, 4(a0)
 ; RV32-NEXT:    lbu a2, 3(a1)
-; RV32-NEXT:    sb a2, 3(a0)
-; RV32-NEXT:    lbu a2, 2(a1)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    lbu a3, 2(a1)
+; RV32-NEXT:    lbu a4, 1(a1)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    sb a3, 2(a0)
+; RV32-NEXT:    sb a4, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
@@ -332,32 +332,32 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    lbu a2, 4(a1)
 ; RV64-NEXT:    sb a2, 4(a0)
 ; RV64-NEXT:    lbu a2, 3(a1)
-; RV64-NEXT:    sb a2, 3(a0)
-; RV64-NEXT:    lbu a2, 2(a1)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    lbu a3, 2(a1)
+; RV64-NEXT:    lbu a4, 1(a1)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    sb a3, 2(a0)
+; RV64-NEXT:    sb a4, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: unaligned_memcpy15:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 11(a1)
-; RV32-FAST-NEXT:    sw a2, 11(a0)
-; RV32-FAST-NEXT:    lw a2, 8(a1)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    lw a2, 4(a1)
-; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 11(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy15:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 7(a1)
-; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -393,12 +393,12 @@ define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    lbu a2, 4(a1)
 ; RV32-NEXT:    sb a2, 4(a0)
 ; RV32-NEXT:    lbu a2, 3(a1)
-; RV32-NEXT:    sb a2, 3(a0)
-; RV32-NEXT:    lbu a2, 2(a1)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    lbu a3, 2(a1)
+; RV32-NEXT:    lbu a4, 1(a1)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    sb a3, 2(a0)
+; RV32-NEXT:    sb a4, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
@@ -429,32 +429,32 @@ define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    lbu a2, 4(a1)
 ; RV64-NEXT:    sb a2, 4(a0)
 ; RV64-NEXT:    lbu a2, 3(a1)
-; RV64-NEXT:    sb a2, 3(a0)
-; RV64-NEXT:    lbu a2, 2(a1)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    lbu a3, 2(a1)
+; RV64-NEXT:    lbu a4, 1(a1)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    sb a3, 2(a0)
+; RV64-NEXT:    sb a4, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: unaligned_memcpy16:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 12(a1)
-; RV32-FAST-NEXT:    sw a2, 12(a0)
-; RV32-FAST-NEXT:    lw a2, 8(a1)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    lw a2, 4(a1)
-; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy16:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 8(a1)
-; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -520,12 +520,12 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    lbu a2, 4(a1)
 ; RV32-NEXT:    sb a2, 4(a0)
 ; RV32-NEXT:    lbu a2, 3(a1)
-; RV32-NEXT:    sb a2, 3(a0)
-; RV32-NEXT:    lbu a2, 2(a1)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    lbu a2, 1(a1)
-; RV32-NEXT:    sb a2, 1(a0)
+; RV32-NEXT:    lbu a3, 2(a1)
+; RV32-NEXT:    lbu a4, 1(a1)
 ; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    sb a2, 3(a0)
+; RV32-NEXT:    sb a3, 2(a0)
+; RV32-NEXT:    sb a4, 1(a0)
 ; RV32-NEXT:    sb a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
@@ -586,12 +586,12 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    lbu a2, 4(a1)
 ; RV64-NEXT:    sb a2, 4(a0)
 ; RV64-NEXT:    lbu a2, 3(a1)
-; RV64-NEXT:    sb a2, 3(a0)
-; RV64-NEXT:    lbu a2, 2(a1)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    lbu a2, 1(a1)
-; RV64-NEXT:    sb a2, 1(a0)
+; RV64-NEXT:    lbu a3, 2(a1)
+; RV64-NEXT:    lbu a4, 1(a1)
 ; RV64-NEXT:    lbu a1, 0(a1)
+; RV64-NEXT:    sb a2, 3(a0)
+; RV64-NEXT:    sb a3, 2(a0)
+; RV64-NEXT:    sb a4, 1(a0)
 ; RV64-NEXT:    sb a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
@@ -606,24 +606,24 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-NEXT:    lw a2, 16(a1)
 ; RV32-FAST-NEXT:    sw a2, 16(a0)
 ; RV32-FAST-NEXT:    lw a2, 12(a1)
-; RV32-FAST-NEXT:    sw a2, 12(a0)
-; RV32-FAST-NEXT:    lw a2, 8(a1)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    lw a2, 4(a1)
-; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy31:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 23(a1)
-; RV64-FAST-NEXT:    sd a2, 23(a0)
-; RV64-FAST-NEXT:    ld a2, 16(a1)
-; RV64-FAST-NEXT:    sd a2, 16(a0)
-; RV64-FAST-NEXT:    ld a2, 8(a1)
-; RV64-FAST-NEXT:    sd a2, 8(a0)
+; RV64-FAST-NEXT:    ld a3, 16(a1)
+; RV64-FAST-NEXT:    ld a4, 8(a1)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 23(a0)
+; RV64-FAST-NEXT:    sd a3, 16(a0)
+; RV64-FAST-NEXT:    sd a4, 8(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -743,16 +743,16 @@ define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-LABEL: aligned_memcpy7:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 3(a1)
-; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: aligned_memcpy7:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    lw a2, 3(a1)
-; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    lw a1, 0(a1)
+; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    sw a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -764,8 +764,8 @@ define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-BOTH-LABEL: aligned_memcpy8:
 ; RV32-BOTH:       # %bb.0: # %entry
 ; RV32-BOTH-NEXT:    lw a2, 4(a1)
-; RV32-BOTH-NEXT:    sw a2, 4(a0)
 ; RV32-BOTH-NEXT:    lw a1, 0(a1)
+; RV32-BOTH-NEXT:    sw a2, 4(a0)
 ; RV32-BOTH-NEXT:    sw a1, 0(a0)
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -787,10 +787,10 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    lh a2, 12(a1)
 ; RV32-NEXT:    sh a2, 12(a0)
 ; RV32-NEXT:    lw a2, 8(a1)
-; RV32-NEXT:    sw a2, 8(a0)
-; RV32-NEXT:    lw a2, 4(a1)
-; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    lw a3, 4(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sw a2, 8(a0)
+; RV32-NEXT:    sw a3, 4(a0)
 ; RV32-NEXT:    sw a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
@@ -809,20 +809,20 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-LABEL: aligned_memcpy15:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 11(a1)
-; RV32-FAST-NEXT:    sw a2, 11(a0)
-; RV32-FAST-NEXT:    lw a2, 8(a1)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    lw a2, 4(a1)
-; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 11(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: aligned_memcpy15:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 7(a1)
-; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -834,20 +834,20 @@ define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-BOTH-LABEL: aligned_memcpy16:
 ; RV32-BOTH:       # %bb.0: # %entry
 ; RV32-BOTH-NEXT:    lw a2, 12(a1)
-; RV32-BOTH-NEXT:    sw a2, 12(a0)
-; RV32-BOTH-NEXT:    lw a2, 8(a1)
-; RV32-BOTH-NEXT:    sw a2, 8(a0)
-; RV32-BOTH-NEXT:    lw a2, 4(a1)
-; RV32-BOTH-NEXT:    sw a2, 4(a0)
+; RV32-BOTH-NEXT:    lw a3, 8(a1)
+; RV32-BOTH-NEXT:    lw a4, 4(a1)
 ; RV32-BOTH-NEXT:    lw a1, 0(a1)
+; RV32-BOTH-NEXT:    sw a2, 12(a0)
+; RV32-BOTH-NEXT:    sw a3, 8(a0)
+; RV32-BOTH-NEXT:    sw a4, 4(a0)
 ; RV32-BOTH-NEXT:    sw a1, 0(a0)
 ; RV32-BOTH-NEXT:    ret
 ;
 ; RV64-BOTH-LABEL: aligned_memcpy16:
 ; RV64-BOTH:       # %bb.0: # %entry
 ; RV64-BOTH-NEXT:    ld a2, 8(a1)
-; RV64-BOTH-NEXT:    sd a2, 8(a0)
 ; RV64-BOTH-NEXT:    ld a1, 0(a1)
+; RV64-BOTH-NEXT:    sd a2, 8(a0)
 ; RV64-BOTH-NEXT:    sd a1, 0(a0)
 ; RV64-BOTH-NEXT:    ret
 entry:
@@ -869,12 +869,12 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    lw a2, 16(a1)
 ; RV32-NEXT:    sw a2, 16(a0)
 ; RV32-NEXT:    lw a2, 12(a1)
-; RV32-NEXT:    sw a2, 12(a0)
-; RV32-NEXT:    lw a2, 8(a1)
-; RV32-NEXT:    sw a2, 8(a0)
-; RV32-NEXT:    lw a2, 4(a1)
-; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    lw a3, 8(a1)
+; RV32-NEXT:    lw a4, 4(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    sw a3, 8(a0)
+; RV32-NEXT:    sw a4, 4(a0)
 ; RV32-NEXT:    sw a1, 0(a0)
 ; RV32-NEXT:    ret
 ;
@@ -887,10 +887,10 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    lw a2, 24(a1)
 ; RV64-NEXT:    sw a2, 24(a0)
 ; RV64-NEXT:    ld a2, 16(a1)
-; RV64-NEXT:    sd a2, 16(a0)
-; RV64-NEXT:    ld a2, 8(a1)
-; RV64-NEXT:    sd a2, 8(a0)
+; RV64-NEXT:    ld a3, 8(a1)
 ; RV64-NEXT:    ld a1, 0(a1)
+; RV64-NEXT:    sd a2, 16(a0)
+; RV64-NEXT:    sd a3, 8(a0)
 ; RV64-NEXT:    sd a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
@@ -905,24 +905,24 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-NEXT:    lw a2, 16(a1)
 ; RV32-FAST-NEXT:    sw a2, 16(a0)
 ; RV32-FAST-NEXT:    lw a2, 12(a1)
-; RV32-FAST-NEXT:    sw a2, 12(a0)
-; RV32-FAST-NEXT:    lw a2, 8(a1)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    lw a2, 4(a1)
-; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 4(a1)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: aligned_memcpy31:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 23(a1)
-; RV64-FAST-NEXT:    sd a2, 23(a0)
-; RV64-FAST-NEXT:    ld a2, 16(a1)
-; RV64-FAST-NEXT:    sd a2, 16(a0)
-; RV64-FAST-NEXT:    ld a2, 8(a1)
-; RV64-FAST-NEXT:    sd a2, 8(a0)
+; RV64-FAST-NEXT:    ld a3, 16(a1)
+; RV64-FAST-NEXT:    ld a4, 8(a1)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 23(a0)
+; RV64-FAST-NEXT:    sd a3, 16(a0)
+; RV64-FAST-NEXT:    sd a4, 8(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -938,32 +938,32 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind {
 ; RV32-BOTH-LABEL: memcpy16_align4:
 ; RV32-BOTH:       # %bb.0: # %entry
 ; RV32-BOTH-NEXT:    lw a2, 12(a1)
-; RV32-BOTH-NEXT:    sw a2, 12(a0)
-; RV32-BOTH-NEXT:    lw a2, 8(a1)
-; RV32-BOTH-NEXT:    sw a2, 8(a0)
-; RV32-BOTH-NEXT:    lw a2, 4(a1)
-; RV32-BOTH-NEXT:    sw a2, 4(a0)
+; RV32-BOTH-NEXT:    lw a3, 8(a1)
+; RV32-BOTH-NEXT:    lw a4, 4(a1)
 ; RV32-BOTH-NEXT:    lw a1, 0(a1)
+; RV32-BOTH-NEXT:    sw a2, 12(a0)
+; RV32-BOTH-NEXT:    sw a3, 8(a0)
+; RV32-BOTH-NEXT:    sw a4, 4(a0)
 ; RV32-BOTH-NEXT:    sw a1, 0(a0)
 ; RV32-BOTH-NEXT:    ret
 ;
 ; RV64-LABEL: memcpy16_align4:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lw a2, 12(a1)
-; RV64-NEXT:    sw a2, 12(a0)
-; RV64-NEXT:    lw a2, 8(a1)
-; RV64-NEXT:    sw a2, 8(a0)
-; RV64-NEXT:    lw a2, 4(a1)
-; RV64-NEXT:    sw a2, 4(a0)
+; RV64-NEXT:    lw a3, 8(a1)
+; RV64-NEXT:    lw a4, 4(a1)
 ; RV64-NEXT:    lw a1, 0(a1)
+; RV64-NEXT:    sw a2, 12(a0)
+; RV64-NEXT:    sw a3, 8(a0)
+; RV64-NEXT:    sw a4, 4(a0)
 ; RV64-NEXT:    sw a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: memcpy16_align4:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 8(a1)
-; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -979,8 +979,8 @@ define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) {
 ; RV32-NEXT:    lh a2, 8(a1)
 ; RV32-NEXT:    sh a2, 8(a0)
 ; RV32-NEXT:    lw a2, 4(a1)
-; RV32-NEXT:    sw a2, 4(a0)
 ; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    sw a2, 4(a0)
 ; RV32-NEXT:    sw a1, 0(a0)
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -999,10 +999,10 @@ define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) {
 ; RV32-FAST-LABEL: memcpy11_align8:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 7(a1)
-; RV32-FAST-NEXT:    sw a2, 7(a0)
-; RV32-FAST-NEXT:    lw a2, 4(a1)
-; RV32-FAST-NEXT:    sw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a3, 4(a1)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 7(a0)
+; RV32-FAST-NEXT:    sw a3, 4(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    li a0, 0
 ; RV32-FAST-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll
index 44e9631d3442d6..9da5e2b00b2154 100644
--- a/llvm/test/CodeGen/RISCV/memcpy.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy.ll
@@ -409,32 +409,32 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind {
 ; RV32-BOTH-LABEL: t7:
 ; RV32-BOTH:       # %bb.0: # %entry
 ; RV32-BOTH-NEXT:    lw a2, 12(a1)
-; RV32-BOTH-NEXT:    sw a2, 12(a0)
-; RV32-BOTH-NEXT:    lw a2, 8(a1)
-; RV32-BOTH-NEXT:    sw a2, 8(a0)
-; RV32-BOTH-NEXT:    lw a2, 4(a1)
-; RV32-BOTH-NEXT:    sw a2, 4(a0)
+; RV32-BOTH-NEXT:    lw a3, 8(a1)
+; RV32-BOTH-NEXT:    lw a4, 4(a1)
 ; RV32-BOTH-NEXT:    lw a1, 0(a1)
+; RV32-BOTH-NEXT:    sw a2, 12(a0)
+; RV32-BOTH-NEXT:    sw a3, 8(a0)
+; RV32-BOTH-NEXT:    sw a4, 4(a0)
 ; RV32-BOTH-NEXT:    sw a1, 0(a0)
 ; RV32-BOTH-NEXT:    ret
 ;
 ; RV64-LABEL: t7:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lw a2, 12(a1)
-; RV64-NEXT:    sw a2, 12(a0)
-; RV64-NEXT:    lw a2, 8(a1)
-; RV64-NEXT:    sw a2, 8(a0)
-; RV64-NEXT:    lw a2, 4(a1)
-; RV64-NEXT:    sw a2, 4(a0)
+; RV64-NEXT:    lw a3, 8(a1)
+; RV64-NEXT:    lw a4, 4(a1)
 ; RV64-NEXT:    lw a1, 0(a1)
+; RV64-NEXT:    sw a2, 12(a0)
+; RV64-NEXT:    sw a3, 8(a0)
+; RV64-NEXT:    sw a4, 4(a0)
 ; RV64-NEXT:    sw a1, 0(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: t7:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 8(a1)
-; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
index d84f24aaac8b8c..069d38460bb19a 100644
--- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
+++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; REQUIRES: asserts
 ; RUN: llc -mtriple=riscv32 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
@@ -5,13 +6,6 @@
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
 
 define i32 @load_clustering_1(ptr nocapture %p) {
-; LDCLUSTER: ********** MI Scheduling **********
-; LDCLUSTER-LABEL: load_clustering_1:%bb.0
-; LDCLUSTER: *** Final schedule for %bb.0 ***
-; LDCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
-; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
-; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
-; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
 entry:
   %arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3
   %val0 = load i32, i32* %arrayidx0
@@ -26,3 +20,5 @@ entry:
   %tmp2 = add i32 %tmp1, %val3
   ret i32 %tmp2
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; LDCLUSTER: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/pr63816.ll b/llvm/test/CodeGen/RISCV/pr63816.ll
index 21730dfcf13bcd..45e5336d9c58c4 100644
--- a/llvm/test/CodeGen/RISCV/pr63816.ll
+++ b/llvm/test/CodeGen/RISCV/pr63816.ll
@@ -4,17 +4,20 @@
 define void @test(ptr %0, ptr %1) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -80
-; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    sd s1, 56(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    fsd fs0, 48(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    fsd fs1, 40(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    fsd fs2, 32(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    fsd fs3, 24(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    fsd fs4, 16(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    fsd fs5, 8(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    fsd fs6, 0(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi sp, sp, -112
+; CHECK-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs0, 56(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs1, 48(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs2, 40(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs3, 32(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs4, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs5, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    fsd fs6, 8(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    mv s0, a1
 ; CHECK-NEXT:    mv s1, a0
 ; CHECK-NEXT:    lhu a0, 12(a0)
@@ -30,19 +33,19 @@ define void @test(ptr %0, ptr %1) nounwind {
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fmv.s fs2, fa0
 ; CHECK-NEXT:    lhu a0, 6(s1)
+; CHECK-NEXT:    lhu s2, 0(s1)
+; CHECK-NEXT:    lhu s3, 2(s1)
+; CHECK-NEXT:    lhu s4, 4(s1)
 ; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fmv.s fs3, fa0
-; CHECK-NEXT:    lhu a0, 4(s1)
-; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fmv.s fs4, fa0
-; CHECK-NEXT:    lhu a0, 2(s1)
-; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fmv.s fs5, fa0
-; CHECK-NEXT:    lhu a0, 0(s1)
-; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fcvt.d.s fs6, fa0
 ; CHECK-NEXT:    fcvt.d.s fs5, fs5
@@ -63,17 +66,20 @@ define void @test(ptr %0, ptr %1) nounwind {
 ; CHECK-NEXT:    fsd fs4, 16(s0)
 ; CHECK-NEXT:    fsd fs5, 8(s0)
 ; CHECK-NEXT:    fsd fs6, 0(s0)
-; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    ld s1, 56(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    fld fs0, 48(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    fld fs1, 40(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    fld fs2, 32(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    fld fs3, 24(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    fld fs4, 16(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    fld fs5, 8(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    fld fs6, 0(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    addi sp, sp, 80
+; CHECK-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs0, 56(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs1, 48(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs2, 40(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs3, 32(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs4, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs5, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    fld fs6, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 112
 ; CHECK-NEXT:    ret
   %V1 = load <8 x half>, ptr %0
   %V2 = fpext <8 x half> %V1 to <8 x double>
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll
index 456a880891f730..3ecef05e962642 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll
@@ -36,9 +36,9 @@ define void @lbu(ptr %a, ptr %b) nounwind {
 define void @lh(ptr %a, ptr %b) nounwind {
 ; RV64I-LABEL: lh:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lh a2, 2(a0)
 ; RV64I-NEXT:    lh zero, 0(a0)
-; RV64I-NEXT:    sw a2, 0(a1)
+; RV64I-NEXT:    lh a0, 2(a0)
+; RV64I-NEXT:    sw a0, 0(a1)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i16, ptr %a, i32 1
   %2 = load i16, ptr %1
@@ -65,9 +65,9 @@ define void @lhu(ptr %a, ptr %b) nounwind {
 define void @lw(ptr %a, ptr %b) nounwind {
 ; RV64I-LABEL: lw:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lw a2, 4(a0)
 ; RV64I-NEXT:    lw zero, 0(a0)
-; RV64I-NEXT:    sd a2, 0(a1)
+; RV64I-NEXT:    lw a0, 4(a0)
+; RV64I-NEXT:    sd a0, 0(a1)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i32, ptr %a, i64 1
   %2 = load i32, ptr %1
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll
index 76ab0e7d5810e7..e2b885c59b70ff 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll
@@ -22,9 +22,8 @@ define dso_local i64 @lb(ptr %a) nounwind {
 define dso_local i64 @lh(ptr %a) nounwind {
 ; RV64I-LABEL: lh:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lh a1, 4(a0)
 ; RV64I-NEXT:    lh zero, 0(a0)
-; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    lh a0, 4(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i16, ptr %a, i32 2
   %2 = load i16, ptr %1
@@ -37,9 +36,8 @@ define dso_local i64 @lh(ptr %a) nounwind {
 define dso_local i64 @lw(ptr %a) nounwind {
 ; RV64I-LABEL: lw:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lw a1, 12(a0)
 ; RV64I-NEXT:    lw zero, 0(a0)
-; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    lw a0, 12(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i32, ptr %a, i32 3
   %2 = load i32, ptr %1
@@ -52,9 +50,9 @@ define dso_local i64 @lw(ptr %a) nounwind {
 define dso_local i64 @lbu(ptr %a) nounwind {
 ; RV64I-LABEL: lbu:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 4(a0)
-; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = getelementptr i8, ptr %a, i32 4
   %2 = load i8, ptr %1
@@ -68,9 +66,9 @@ define dso_local i64 @lbu(ptr %a) nounwind {
 define dso_local i64 @lhu(ptr %a) nounwind {
 ; RV64I-LABEL: lhu:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lhu a1, 10(a0)
-; RV64I-NEXT:    lhu a0, 0(a0)
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 10(a0)
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = getelementptr i16, ptr %a, i32 5
   %2 = load i16, ptr %1
@@ -84,9 +82,9 @@ define dso_local i64 @lhu(ptr %a) nounwind {
 define dso_local i64 @lwu(ptr %a) nounwind {
 ; RV64I-LABEL: lwu:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lwu a1, 24(a0)
-; RV64I-NEXT:    lwu a0, 0(a0)
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lwu a1, 0(a0)
+; RV64I-NEXT:    lwu a0, 24(a0)
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = getelementptr i32, ptr %a, i32 6
   %2 = load i32, ptr %1
@@ -102,9 +100,8 @@ define dso_local i64 @lwu(ptr %a) nounwind {
 define dso_local i64 @ld(ptr %a) nounwind {
 ; RV64I-LABEL: ld:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a1, 80(a0)
 ; RV64I-NEXT:    ld zero, 0(a0)
-; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    ld a0, 80(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr i64, ptr %a, i32 10
   %2 = load i64, ptr %1
@@ -128,10 +125,10 @@ define dso_local void @sd(ptr %a, i64 %b) nounwind {
 define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 ; RV64I-LABEL: load_sext_zext_anyext_i1:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 2(a0)
 ; RV64I-NEXT:    lbu zero, 0(a0)
-; RV64I-NEXT:    sub a0, a2, a1
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    lbu a0, 2(a0)
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
@@ -150,10 +147,10 @@ define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind {
 ; RV64I-LABEL: load_sext_zext_anyext_i1_i16:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 2(a0)
 ; RV64I-NEXT:    lbu zero, 0(a0)
-; RV64I-NEXT:    subw a0, a2, a1
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    lbu a0, 2(a0)
+; RV64I-NEXT:    subw a0, a0, a1
 ; RV64I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index 3c0aaaa94fb55c..c6208d1fc3cbca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -357,37 +357,37 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa3, v8
 ; RV32-NEXT:    feq.d a0, fa3, fa3
+; RV32-NEXT:    neg a0, a0
 ; RV32-NEXT:    fmax.d fa3, fa3, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
 ; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
+; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v10, a0
 ; RV32-NEXT:    fld fa3, 32(sp)
 ; RV32-NEXT:    fld fa2, 40(sp)
 ; RV32-NEXT:    fld fa1, 48(sp)
 ; RV32-NEXT:    fld fa0, 56(sp)
-; RV32-NEXT:    feq.d a3, fa3, fa3
+; RV32-NEXT:    feq.d a0, fa3, fa3
+; RV32-NEXT:    neg a0, a0
 ; RV32-NEXT:    fmax.d fa3, fa3, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a4, fa3, rtz
-; RV32-NEXT:    feq.d a5, fa2, fa2
+; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    feq.d a0, fa2, fa2
+; RV32-NEXT:    neg a0, a0
 ; RV32-NEXT:    fmax.d fa3, fa2, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a6, fa3, rtz
-; RV32-NEXT:    feq.d a7, fa1, fa1
+; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    feq.d a0, fa1, fa1
+; RV32-NEXT:    neg a0, a0
 ; RV32-NEXT:    fmax.d fa3, fa1, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d t0, fa3, rtz
-; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
 ; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    neg a0, a3
-; RV32-NEXT:    and a0, a0, a4
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    neg a0, a5
-; RV32-NEXT:    and a0, a0, a6
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    neg a0, a7
-; RV32-NEXT:    and a0, a0, t0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    feq.d a0, fa0, fa0
 ; RV32-NEXT:    neg a0, a0
@@ -456,37 +456,37 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa3, v8
 ; RV64-NEXT:    feq.d a0, fa3, fa3
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    fmax.d fa3, fa3, fa5
 ; RV64-NEXT:    fmin.d fa3, fa3, fa4
 ; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
+; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vslide1down.vx v8, v10, a0
 ; RV64-NEXT:    fld fa3, 32(sp)
 ; RV64-NEXT:    fld fa2, 40(sp)
 ; RV64-NEXT:    fld fa1, 48(sp)
 ; RV64-NEXT:    fld fa0, 56(sp)
-; RV64-NEXT:    feq.d a3, fa3, fa3
+; RV64-NEXT:    feq.d a0, fa3, fa3
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    fmax.d fa3, fa3, fa5
 ; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a4, fa3, rtz
-; RV64-NEXT:    feq.d a5, fa2, fa2
+; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    feq.d a0, fa2, fa2
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    fmax.d fa3, fa2, fa5
 ; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d a6, fa3, rtz
-; RV64-NEXT:    feq.d a7, fa1, fa1
+; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    feq.d a0, fa1, fa1
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    fmax.d fa3, fa1, fa5
 ; RV64-NEXT:    fmin.d fa3, fa3, fa4
-; RV64-NEXT:    fcvt.l.d t0, fa3, rtz
-; RV64-NEXT:    neg a0, a0
+; RV64-NEXT:    fcvt.l.d a2, fa3, rtz
 ; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
-; RV64-NEXT:    neg a0, a3
-; RV64-NEXT:    and a0, a0, a4
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    neg a0, a5
-; RV64-NEXT:    and a0, a0, a6
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    neg a0, a7
-; RV64-NEXT:    and a0, a0, t0
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    feq.d a0, fa0, fa0
 ; RV64-NEXT:    neg a0, a0
@@ -556,23 +556,23 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT:    fmin.d fa4, fa4, fa5
 ; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v10, a0
 ; RV32-NEXT:    fld fa4, 32(sp)
 ; RV32-NEXT:    fld fa2, 40(sp)
 ; RV32-NEXT:    fld fa1, 48(sp)
 ; RV32-NEXT:    fld fa0, 56(sp)
 ; RV32-NEXT:    fmax.d fa4, fa4, fa3
 ; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a2, fa4, rtz
+; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fmax.d fa4, fa2, fa3
 ; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a3, fa4, rtz
+; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fmax.d fa4, fa1, fa3
 ; RV32-NEXT:    fmin.d fa4, fa4, fa5
-; RV32-NEXT:    fcvt.wu.d a4, fa4, rtz
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-NEXT:    fcvt.wu.d a0, fa4, rtz
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fmax.d fa4, fa0, fa3
 ; RV32-NEXT:    fmin.d fa5, fa4, fa5
 ; RV32-NEXT:    fcvt.wu.d a0, fa5, rtz
@@ -630,23 +630,23 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT:    fmin.d fa4, fa4, fa5
 ; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vslide1down.vx v8, v10, a0
 ; RV64-NEXT:    fld fa4, 32(sp)
 ; RV64-NEXT:    fld fa2, 40(sp)
 ; RV64-NEXT:    fld fa1, 48(sp)
 ; RV64-NEXT:    fld fa0, 56(sp)
 ; RV64-NEXT:    fmax.d fa4, fa4, fa3
 ; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a2, fa4, rtz
+; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    fmax.d fa4, fa2, fa3
 ; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a3, fa4, rtz
+; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    fmax.d fa4, fa1, fa3
 ; RV64-NEXT:    fmin.d fa4, fa4, fa5
-; RV64-NEXT:    fcvt.lu.d a4, fa4, rtz
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
-; RV64-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
-; RV64-NEXT:    vslide1down.vx v8, v8, a4
+; RV64-NEXT:    fcvt.lu.d a0, fa4, rtz
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    fmax.d fa4, fa0, fa3
 ; RV64-NEXT:    fmin.d fa5, fa4, fa5
 ; RV64-NEXT:    fcvt.lu.d a0, fa5, rtz
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
index 924d9a84ce9278..861ff459e87f33 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
@@ -136,8 +136,8 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32:       # %bb.0:
 ; LMULMAX8RV32-NEXT:    lw a1, 0(a0)
 ; LMULMAX8RV32-NEXT:    lw a2, 4(a0)
-; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    lw a0, 8(a0)
+; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a0
@@ -153,8 +153,8 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV64:       # %bb.0:
 ; LMULMAX8RV64-NEXT:    ld a1, 0(a0)
 ; LMULMAX8RV64-NEXT:    ld a2, 8(a0)
-; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    ld a0, 16(a0)
+; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a0
@@ -170,8 +170,8 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX1RV32:       # %bb.0:
 ; LMULMAX1RV32-NEXT:    lw a1, 0(a0)
 ; LMULMAX1RV32-NEXT:    lw a2, 4(a0)
-; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    lw a0, 8(a0)
+; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a0
@@ -187,8 +187,8 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX1RV64:       # %bb.0:
 ; LMULMAX1RV64-NEXT:    ld a1, 0(a0)
 ; LMULMAX1RV64-NEXT:    ld a2, 8(a0)
-; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    ld a0, 16(a0)
+; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    vmv.v.x v8, a1
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a0
@@ -204,8 +204,8 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32ZVFHMIN:       # %bb.0:
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 0(a0)
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 4(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a0, 8(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
@@ -221,8 +221,8 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV64ZVFHMIN:       # %bb.0:
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 0(a0)
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 8(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a0, 16(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
@@ -243,8 +243,8 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32:       # %bb.0:
 ; LMULMAX8RV32-NEXT:    lw a1, 0(a0)
 ; LMULMAX8RV32-NEXT:    lw a2, 4(a0)
-; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    lw a0, 8(a0)
+; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a0
@@ -260,8 +260,8 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV64:       # %bb.0:
 ; LMULMAX8RV64-NEXT:    ld a1, 0(a0)
 ; LMULMAX8RV64-NEXT:    ld a2, 8(a0)
-; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    ld a0, 16(a0)
+; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a0
@@ -277,8 +277,8 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX1RV32:       # %bb.0:
 ; LMULMAX1RV32-NEXT:    lw a1, 0(a0)
 ; LMULMAX1RV32-NEXT:    lw a2, 4(a0)
-; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    lw a0, 8(a0)
+; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a0
@@ -294,8 +294,8 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX1RV64:       # %bb.0:
 ; LMULMAX1RV64-NEXT:    ld a1, 0(a0)
 ; LMULMAX1RV64-NEXT:    ld a2, 8(a0)
-; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    ld a0, 16(a0)
+; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    vmv.v.x v8, a1
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a0
@@ -311,8 +311,8 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32ZVFHMIN:       # %bb.0:
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 0(a0)
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 4(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a0, 8(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
@@ -328,8 +328,8 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV64ZVFHMIN:       # %bb.0:
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 0(a0)
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 8(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a0, 16(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index bad3836c506035..432c49514bb655 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -15,9 +15,9 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    ld a6, 56(a0)
 ; ZVE32X-NEXT:    ld a7, 72(a0)
 ; ZVE32X-NEXT:    ld a0, 80(a0)
+; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    xor a3, a3, a4
 ; ZVE32X-NEXT:    snez a3, a3
-; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmv.s.x v8, a3
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index 7c5047bbdf6352..db058fc5c102ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -453,19 +453,25 @@ declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>)
 define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-LABEL: llrint_v16i64_v16f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -400
-; RV32-NEXT:    .cfi_def_cfa_offset 400
-; RV32-NEXT:    sw ra, 396(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 392(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -384
+; RV32-NEXT:    .cfi_def_cfa_offset 384
+; RV32-NEXT:    sw ra, 380(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 376(sp) # 4-byte Folded Spill
+; RV32-NEXT:    fsd fs0, 368(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs1, 360(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs2, 352(sp) # 8-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 400
+; RV32-NEXT:    .cfi_offset fs0, -16
+; RV32-NEXT:    .cfi_offset fs1, -24
+; RV32-NEXT:    .cfi_offset fs2, -32
+; RV32-NEXT:    addi s0, sp, 384
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 2
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -128
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
@@ -487,30 +493,33 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    sw a1, 228(sp)
 ; RV32-NEXT:    sw a0, 224(sp)
 ; RV32-NEXT:    flw fa0, 108(sp)
+; RV32-NEXT:    flw fs0, 96(sp)
+; RV32-NEXT:    flw fs1, 100(sp)
+; RV32-NEXT:    flw fs2, 104(sp)
 ; RV32-NEXT:    call llrintf@plt
 ; RV32-NEXT:    sw a1, 220(sp)
 ; RV32-NEXT:    sw a0, 216(sp)
-; RV32-NEXT:    flw fa0, 104(sp)
+; RV32-NEXT:    fmv.s fa0, fs2
 ; RV32-NEXT:    call llrintf@plt
 ; RV32-NEXT:    sw a1, 212(sp)
 ; RV32-NEXT:    sw a0, 208(sp)
-; RV32-NEXT:    flw fa0, 100(sp)
+; RV32-NEXT:    fmv.s fa0, fs1
 ; RV32-NEXT:    call llrintf@plt
 ; RV32-NEXT:    sw a1, 204(sp)
 ; RV32-NEXT:    sw a0, 200(sp)
-; RV32-NEXT:    flw fa0, 96(sp)
+; RV32-NEXT:    fmv.s fa0, fs0
 ; RV32-NEXT:    call llrintf@plt
 ; RV32-NEXT:    sw a1, 196(sp)
 ; RV32-NEXT:    sw a0, 192(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vfmv.f.s fa0, v8
 ; RV32-NEXT:    call llrintf@plt
 ; RV32-NEXT:    sw a1, 132(sp)
 ; RV32-NEXT:    sw a0, 128(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -518,7 +527,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    sw a1, 156(sp)
 ; RV32-NEXT:    sw a0, 152(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 2
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -526,7 +535,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    sw a1, 148(sp)
 ; RV32-NEXT:    sw a0, 144(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -534,7 +543,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    sw a1, 140(sp)
 ; RV32-NEXT:    sw a0, 136(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -542,7 +551,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    sw a1, 188(sp)
 ; RV32-NEXT:    sw a0, 184(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 6
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -550,7 +559,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    sw a1, 180(sp)
 ; RV32-NEXT:    sw a0, 176(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 5
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -558,7 +567,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    sw a1, 172(sp)
 ; RV32-NEXT:    sw a0, 168(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    addi a0, sp, 384
+; RV32-NEXT:    addi a0, sp, 352
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 4
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -569,10 +578,13 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    addi a1, sp, 128
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
-; RV32-NEXT:    addi sp, s0, -400
-; RV32-NEXT:    lw ra, 396(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 392(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 400
+; RV32-NEXT:    addi sp, s0, -384
+; RV32-NEXT:    lw ra, 380(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 376(sp) # 4-byte Folded Reload
+; RV32-NEXT:    fld fs0, 368(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs1, 360(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs2, 352(sp) # 8-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 384
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: llrint_v16i64_v16f32:
@@ -602,16 +614,16 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    sd a0, 224(sp)
 ; RV64-NEXT:    flw fa5, 108(sp)
+; RV64-NEXT:    flw fa4, 104(sp)
+; RV64-NEXT:    flw fa3, 96(sp)
+; RV64-NEXT:    flw fa2, 100(sp)
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    sd a0, 216(sp)
-; RV64-NEXT:    flw fa5, 104(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    fcvt.l.s a0, fa4
 ; RV64-NEXT:    sd a0, 208(sp)
-; RV64-NEXT:    flw fa5, 100(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    fcvt.l.s a0, fa2
 ; RV64-NEXT:    sd a0, 200(sp)
-; RV64-NEXT:    flw fa5, 96(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    fcvt.l.s a0, fa3
 ; RV64-NEXT:    sd a0, 192(sp)
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    fcvt.l.s a0, fa5
@@ -857,48 +869,57 @@ declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>)
 define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV32-LABEL: llrint_v8i64_v8f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -272
-; RV32-NEXT:    .cfi_def_cfa_offset 272
-; RV32-NEXT:    sw ra, 268(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 264(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -256
+; RV32-NEXT:    .cfi_def_cfa_offset 256
+; RV32-NEXT:    sw ra, 252(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 248(sp) # 4-byte Folded Spill
+; RV32-NEXT:    fsd fs0, 240(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs1, 232(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs2, 224(sp) # 8-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 272
+; RV32-NEXT:    .cfi_offset fs0, -16
+; RV32-NEXT:    .cfi_offset fs1, -24
+; RV32-NEXT:    .cfi_offset fs2, -32
+; RV32-NEXT:    addi s0, sp, 256
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 2
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    addi a0, sp, 256
+; RV32-NEXT:    addi a0, sp, 224
 ; RV32-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    fld fa0, 120(sp)
+; RV32-NEXT:    fld fs0, 96(sp)
+; RV32-NEXT:    fld fs1, 104(sp)
+; RV32-NEXT:    fld fs2, 112(sp)
 ; RV32-NEXT:    call llrint@plt
 ; RV32-NEXT:    sw a1, 188(sp)
 ; RV32-NEXT:    sw a0, 184(sp)
-; RV32-NEXT:    fld fa0, 112(sp)
+; RV32-NEXT:    fmv.d fa0, fs2
 ; RV32-NEXT:    call llrint@plt
 ; RV32-NEXT:    sw a1, 180(sp)
 ; RV32-NEXT:    sw a0, 176(sp)
-; RV32-NEXT:    fld fa0, 104(sp)
+; RV32-NEXT:    fmv.d fa0, fs1
 ; RV32-NEXT:    call llrint@plt
 ; RV32-NEXT:    sw a1, 172(sp)
 ; RV32-NEXT:    sw a0, 168(sp)
-; RV32-NEXT:    fld fa0, 96(sp)
+; RV32-NEXT:    fmv.d fa0, fs0
 ; RV32-NEXT:    call llrint@plt
 ; RV32-NEXT:    sw a1, 164(sp)
 ; RV32-NEXT:    sw a0, 160(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    addi a0, sp, 256
+; RV32-NEXT:    addi a0, sp, 224
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vfmv.f.s fa0, v8
 ; RV32-NEXT:    call llrint@plt
 ; RV32-NEXT:    sw a1, 132(sp)
 ; RV32-NEXT:    sw a0, 128(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    addi a0, sp, 256
+; RV32-NEXT:    addi a0, sp, 224
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -906,7 +927,7 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    sw a1, 140(sp)
 ; RV32-NEXT:    sw a0, 136(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    addi a0, sp, 256
+; RV32-NEXT:    addi a0, sp, 224
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -914,7 +935,7 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    sw a1, 156(sp)
 ; RV32-NEXT:    sw a0, 152(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    addi a0, sp, 256
+; RV32-NEXT:    addi a0, sp, 224
 ; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslidedown.vi v8, v8, 2
 ; RV32-NEXT:    vfmv.f.s fa0, v8
@@ -924,10 +945,13 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi sp, s0, -272
-; RV32-NEXT:    lw ra, 268(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 264(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 272
+; RV32-NEXT:    addi sp, s0, -256
+; RV32-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 248(sp) # 4-byte Folded Reload
+; RV32-NEXT:    fld fs0, 240(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs1, 232(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs2, 224(sp) # 8-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 256
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: llrint_v8i64_v8f64:
@@ -945,16 +969,16 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    fld fa5, 56(sp)
+; RV64-NEXT:    fld fa4, 48(sp)
+; RV64-NEXT:    fld fa3, 32(sp)
+; RV64-NEXT:    fld fa2, 40(sp)
 ; RV64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-NEXT:    sd a0, 120(sp)
-; RV64-NEXT:    fld fa5, 48(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    fcvt.l.d a0, fa4
 ; RV64-NEXT:    sd a0, 112(sp)
-; RV64-NEXT:    fld fa5, 40(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    fcvt.l.d a0, fa2
 ; RV64-NEXT:    sd a0, 104(sp)
-; RV64-NEXT:    fld fa5, 32(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    fcvt.l.d a0, fa3
 ; RV64-NEXT:    sd a0, 96(sp)
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    fcvt.l.d a0, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index 39dda1fd5ebefa..3ad2b862e17e43 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -576,17 +576,17 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    vfmv.f.s fa5, v8
 ; RV32-NEXT:    fcvt.w.d a0, fa5
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v10, a0
 ; RV32-NEXT:    fld fa5, 32(sp)
 ; RV32-NEXT:    fld fa4, 40(sp)
 ; RV32-NEXT:    fld fa3, 48(sp)
 ; RV32-NEXT:    fld fa2, 56(sp)
-; RV32-NEXT:    fcvt.w.d a1, fa5
-; RV32-NEXT:    fcvt.w.d a2, fa4
-; RV32-NEXT:    fcvt.w.d a3, fa3
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    fcvt.w.d a0, fa4
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    fcvt.w.d a0, fa3
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fcvt.w.d a0, fa2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    addi sp, s0, -128
@@ -629,17 +629,17 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-i32-NEXT:    vslide1down.vx v8, v10, a0
 ; RV64-i32-NEXT:    fld fa5, 32(sp)
 ; RV64-i32-NEXT:    fld fa4, 40(sp)
 ; RV64-i32-NEXT:    fld fa3, 48(sp)
 ; RV64-i32-NEXT:    fld fa2, 56(sp)
-; RV64-i32-NEXT:    fcvt.l.d a1, fa5
-; RV64-i32-NEXT:    fcvt.l.d a2, fa4
-; RV64-i32-NEXT:    fcvt.l.d a3, fa3
-; RV64-i32-NEXT:    vslide1down.vx v8, v10, a0
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    fcvt.l.d a0, fa4
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    fcvt.l.d a0, fa3
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa2
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    addi sp, s0, -128
@@ -663,16 +663,16 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vse64.v v8, (a0)
 ; RV64-i64-NEXT:    fld fa5, 56(sp)
+; RV64-i64-NEXT:    fld fa4, 48(sp)
+; RV64-i64-NEXT:    fld fa3, 32(sp)
+; RV64-i64-NEXT:    fld fa2, 40(sp)
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i64-NEXT:    sd a0, 120(sp)
-; RV64-i64-NEXT:    fld fa5, 48(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    fcvt.l.d a0, fa4
 ; RV64-i64-NEXT:    sd a0, 112(sp)
-; RV64-i64-NEXT:    fld fa5, 40(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    fcvt.l.d a0, fa2
 ; RV64-i64-NEXT:    sd a0, 104(sp)
-; RV64-i64-NEXT:    fld fa5, 32(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    fcvt.l.d a0, fa3
 ; RV64-i64-NEXT:    sd a0, 96(sp)
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 92c8eee728308e..3cd2cc0e12696b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -3183,7 +3183,7 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a6, 8(a1)
+; RV64ZVE32F-NEXT:    ld a7, 8(a1)
 ; RV64ZVE32F-NEXT:    ld a4, 16(a1)
 ; RV64ZVE32F-NEXT:    ld a2, 24(a1)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
@@ -3191,16 +3191,16 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v8
-; RV64ZVE32F-NEXT:    vmv.x.s a7, v8
+; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
 ; RV64ZVE32F-NEXT:    beqz zero, .LBB39_5
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a0, a7, 2
+; RV64ZVE32F-NEXT:    andi a0, a6, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_6
 ; RV64ZVE32F-NEXT:  .LBB39_2: # %else2
-; RV64ZVE32F-NEXT:    andi a0, a7, 4
+; RV64ZVE32F-NEXT:    andi a0, a6, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_7
 ; RV64ZVE32F-NEXT:  .LBB39_3: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a7, 8
+; RV64ZVE32F-NEXT:    andi a0, a6, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_8
 ; RV64ZVE32F-NEXT:  .LBB39_4: # %else6
 ; RV64ZVE32F-NEXT:    ret
@@ -3208,15 +3208,15 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a1, 0(a1)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a0, 0(a1)
-; RV64ZVE32F-NEXT:    andi a0, a7, 2
+; RV64ZVE32F-NEXT:    andi a0, a6, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_2
 ; RV64ZVE32F-NEXT:  .LBB39_6: # %cond.store1
-; RV64ZVE32F-NEXT:    sd t0, 0(a6)
-; RV64ZVE32F-NEXT:    andi a0, a7, 4
+; RV64ZVE32F-NEXT:    sd t0, 0(a7)
+; RV64ZVE32F-NEXT:    andi a0, a6, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_3
 ; RV64ZVE32F-NEXT:  .LBB39_7: # %cond.store3
 ; RV64ZVE32F-NEXT:    sd a5, 0(a4)
-; RV64ZVE32F-NEXT:    andi a0, a7, 8
+; RV64ZVE32F-NEXT:    andi a0, a6, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_4
 ; RV64ZVE32F-NEXT:  .LBB39_8: # %cond.store5
 ; RV64ZVE32F-NEXT:    sd a3, 0(a2)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 2ff2529e259a8d..412d759beb7137 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1322,21 +1322,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 486(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 51(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 228(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 484(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 222(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 224(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 226(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 228(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 484(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 482(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 478(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 480(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 50(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 226(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 482(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 49(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 224(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 480(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 48(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 222(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 478(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 47(sp)
 ; ZVFHMIN32-NEXT:    flh fa5, 382(sp)
 ; ZVFHMIN32-NEXT:    flh fa4, 638(sp)
@@ -1390,21 +1390,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 614(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 115(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 356(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 612(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 350(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 352(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 354(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 356(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 612(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 610(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 606(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 608(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 114(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 354(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 610(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 113(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 352(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 608(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 112(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 350(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 606(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 111(sp)
 ; ZVFHMIN32-NEXT:    flh fa5, 220(sp)
 ; ZVFHMIN32-NEXT:    flh fa4, 476(sp)
@@ -1458,21 +1458,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 452(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 34(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 194(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 450(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 188(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 190(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 192(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 194(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 450(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 448(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 444(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 446(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 33(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 192(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 448(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 32(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 190(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 446(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 31(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 188(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 444(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 30(sp)
 ; ZVFHMIN32-NEXT:    flh fa5, 348(sp)
 ; ZVFHMIN32-NEXT:    flh fa4, 604(sp)
@@ -1526,21 +1526,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 580(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 98(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 322(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 578(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 316(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 318(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 320(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 322(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 578(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 576(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 572(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 574(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 97(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 320(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 576(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 96(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 318(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 574(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 95(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 316(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 572(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 94(sp)
 ; ZVFHMIN32-NEXT:    flh fa5, 186(sp)
 ; ZVFHMIN32-NEXT:    flh fa4, 442(sp)
@@ -1594,21 +1594,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 418(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 17(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 160(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 416(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 154(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 156(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 158(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 160(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 416(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 414(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 410(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 412(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 16(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 158(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 414(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 15(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 156(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 412(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 14(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 154(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 410(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 13(sp)
 ; ZVFHMIN32-NEXT:    flh fa5, 314(sp)
 ; ZVFHMIN32-NEXT:    flh fa4, 570(sp)
@@ -1662,21 +1662,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 546(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 81(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 288(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 544(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 282(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 284(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 286(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 288(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 544(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 542(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 538(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 540(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 80(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 286(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 542(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 79(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 284(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 540(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 78(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 282(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 538(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 77(sp)
 ; ZVFHMIN32-NEXT:    flh fa5, 152(sp)
 ; ZVFHMIN32-NEXT:    flh fa4, 408(sp)
@@ -1714,21 +1714,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 392(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 4(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 134(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 390(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 128(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 130(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 132(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 134(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 390(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 388(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 384(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 386(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 3(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 132(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 388(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 2(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 130(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 386(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 1(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 128(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 384(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 0(sp)
 ; ZVFHMIN32-NEXT:    flh fa5, 280(sp)
 ; ZVFHMIN32-NEXT:    flh fa4, 536(sp)
@@ -1766,21 +1766,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    flh fa4, 520(sp)
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 68(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 262(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 518(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    flh fa5, 256(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 258(sp)
+; ZVFHMIN32-NEXT:    flh fa3, 260(sp)
+; ZVFHMIN32-NEXT:    flh fa2, 262(sp)
+; ZVFHMIN32-NEXT:    flh fa1, 518(sp)
+; ZVFHMIN32-NEXT:    flh fa0, 516(sp)
+; ZVFHMIN32-NEXT:    flh ft0, 512(sp)
+; ZVFHMIN32-NEXT:    flh ft1, 514(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN32-NEXT:    sb a0, 67(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 260(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 516(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN32-NEXT:    sb a0, 66(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 258(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 514(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN32-NEXT:    sb a0, 65(sp)
-; ZVFHMIN32-NEXT:    flh fa5, 256(sp)
-; ZVFHMIN32-NEXT:    flh fa4, 512(sp)
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN32-NEXT:    sb a0, 64(sp)
 ; ZVFHMIN32-NEXT:    li a0, 128
 ; ZVFHMIN32-NEXT:    mv a1, sp
@@ -1870,21 +1870,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 486(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 51(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 228(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 484(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 222(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 224(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 226(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 228(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 484(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 482(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 478(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 480(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 50(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 226(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 482(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 49(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 224(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 480(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 48(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 222(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 478(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 47(sp)
 ; ZVFHMIN64-NEXT:    flh fa5, 382(sp)
 ; ZVFHMIN64-NEXT:    flh fa4, 638(sp)
@@ -1938,21 +1938,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 614(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 115(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 356(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 612(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 350(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 352(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 354(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 356(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 612(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 610(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 606(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 608(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 114(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 354(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 610(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 113(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 352(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 608(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 112(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 350(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 606(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 111(sp)
 ; ZVFHMIN64-NEXT:    flh fa5, 220(sp)
 ; ZVFHMIN64-NEXT:    flh fa4, 476(sp)
@@ -2006,21 +2006,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 452(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 34(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 194(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 450(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 188(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 190(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 192(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 194(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 450(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 448(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 444(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 446(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 33(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 192(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 448(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 32(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 190(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 446(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 31(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 188(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 444(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 30(sp)
 ; ZVFHMIN64-NEXT:    flh fa5, 348(sp)
 ; ZVFHMIN64-NEXT:    flh fa4, 604(sp)
@@ -2074,21 +2074,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 580(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 98(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 322(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 578(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 316(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 318(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 320(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 322(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 578(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 576(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 572(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 574(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 97(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 320(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 576(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 96(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 318(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 574(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 95(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 316(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 572(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 94(sp)
 ; ZVFHMIN64-NEXT:    flh fa5, 186(sp)
 ; ZVFHMIN64-NEXT:    flh fa4, 442(sp)
@@ -2142,21 +2142,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 418(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 17(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 160(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 416(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 154(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 156(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 158(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 160(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 416(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 414(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 410(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 412(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 16(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 158(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 414(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 15(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 156(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 412(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 14(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 154(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 410(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 13(sp)
 ; ZVFHMIN64-NEXT:    flh fa5, 314(sp)
 ; ZVFHMIN64-NEXT:    flh fa4, 570(sp)
@@ -2210,21 +2210,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 546(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 81(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 288(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 544(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 282(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 284(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 286(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 288(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 544(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 542(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 538(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 540(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 80(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 286(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 542(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 79(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 284(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 540(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 78(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 282(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 538(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 77(sp)
 ; ZVFHMIN64-NEXT:    flh fa5, 152(sp)
 ; ZVFHMIN64-NEXT:    flh fa4, 408(sp)
@@ -2262,21 +2262,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 392(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 4(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 134(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 390(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 128(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 130(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 132(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 134(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 390(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 388(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 384(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 386(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 3(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 132(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 388(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 2(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 130(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 386(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 1(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 128(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 384(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 0(sp)
 ; ZVFHMIN64-NEXT:    flh fa5, 280(sp)
 ; ZVFHMIN64-NEXT:    flh fa4, 536(sp)
@@ -2314,21 +2314,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    flh fa4, 520(sp)
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 68(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 262(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 518(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    flh fa5, 256(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 258(sp)
+; ZVFHMIN64-NEXT:    flh fa3, 260(sp)
+; ZVFHMIN64-NEXT:    flh fa2, 262(sp)
+; ZVFHMIN64-NEXT:    flh fa1, 518(sp)
+; ZVFHMIN64-NEXT:    flh fa0, 516(sp)
+; ZVFHMIN64-NEXT:    flh ft0, 512(sp)
+; ZVFHMIN64-NEXT:    flh ft1, 514(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa2, fa1
 ; ZVFHMIN64-NEXT:    sb a0, 67(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 260(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 516(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa0
 ; ZVFHMIN64-NEXT:    sb a0, 66(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 258(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 514(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, ft1
 ; ZVFHMIN64-NEXT:    sb a0, 65(sp)
-; ZVFHMIN64-NEXT:    flh fa5, 256(sp)
-; ZVFHMIN64-NEXT:    flh fa4, 512(sp)
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, ft0
 ; ZVFHMIN64-NEXT:    sb a0, 64(sp)
 ; ZVFHMIN64-NEXT:    li a0, 128
 ; ZVFHMIN64-NEXT:    mv a1, sp
diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
index 485f94ee2a1026..c3586095a12f6b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
@@ -162,16 +162,16 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-LABEL: unaligned_memcpy7:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 3(a1)
-; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy7:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    lw a2, 3(a1)
-; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    lw a1, 0(a1)
+; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    sw a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -253,9 +253,9 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-LABEL: unaligned_memcpy15:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 11(a1)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
 ; RV32-FAST-NEXT:    sw a2, 11(a0)
-; RV32-FAST-NEXT:    lw a2, 8(a1)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
 ; RV32-FAST-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-FAST-NEXT:    vle32.v v8, (a1)
 ; RV32-FAST-NEXT:    vse32.v v8, (a0)
@@ -264,8 +264,8 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-FAST-LABEL: unaligned_memcpy15:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 7(a1)
-; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -531,12 +531,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    addi a2, a0, 128
 ; RV32-NEXT:    vse8.v v8, (a2)
 ; RV32-NEXT:    lbu a2, 195(a1)
-; RV32-NEXT:    sb a2, 195(a0)
-; RV32-NEXT:    lbu a2, 194(a1)
-; RV32-NEXT:    sb a2, 194(a0)
-; RV32-NEXT:    lbu a2, 193(a1)
-; RV32-NEXT:    sb a2, 193(a0)
+; RV32-NEXT:    lbu a3, 194(a1)
+; RV32-NEXT:    lbu a4, 193(a1)
 ; RV32-NEXT:    lbu a1, 192(a1)
+; RV32-NEXT:    sb a2, 195(a0)
+; RV32-NEXT:    sb a3, 194(a0)
+; RV32-NEXT:    sb a4, 193(a0)
 ; RV32-NEXT:    sb a1, 192(a0)
 ; RV32-NEXT:    ret
 ;
@@ -553,12 +553,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    addi a2, a0, 128
 ; RV64-NEXT:    vse8.v v8, (a2)
 ; RV64-NEXT:    lbu a2, 195(a1)
-; RV64-NEXT:    sb a2, 195(a0)
-; RV64-NEXT:    lbu a2, 194(a1)
-; RV64-NEXT:    sb a2, 194(a0)
-; RV64-NEXT:    lbu a2, 193(a1)
-; RV64-NEXT:    sb a2, 193(a0)
+; RV64-NEXT:    lbu a3, 194(a1)
+; RV64-NEXT:    lbu a4, 193(a1)
 ; RV64-NEXT:    lbu a1, 192(a1)
+; RV64-NEXT:    sb a2, 195(a0)
+; RV64-NEXT:    sb a3, 194(a0)
+; RV64-NEXT:    sb a4, 193(a0)
 ; RV64-NEXT:    sb a1, 192(a0)
 ; RV64-NEXT:    ret
 ;
@@ -728,16 +728,16 @@ define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-LABEL: aligned_memcpy7:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 3(a1)
-; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 3(a0)
 ; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: aligned_memcpy7:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    lw a2, 3(a1)
-; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    lw a1, 0(a1)
+; RV64-FAST-NEXT:    sw a2, 3(a0)
 ; RV64-FAST-NEXT:    sw a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -792,9 +792,9 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-LABEL: aligned_memcpy15:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lw a2, 11(a1)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
 ; RV32-FAST-NEXT:    sw a2, 11(a0)
-; RV32-FAST-NEXT:    lw a2, 8(a1)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
 ; RV32-FAST-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-FAST-NEXT:    vle32.v v8, (a1)
 ; RV32-FAST-NEXT:    vse32.v v8, (a0)
@@ -803,8 +803,8 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-FAST-LABEL: aligned_memcpy15:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    ld a2, 7(a1)
-; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 7(a0)
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 856a68d5f277a9..04533659cd1466 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -312,8 +312,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    mv s0, a0
 ; RV32-NEXT:    lbu a1, 12(a0)
 ; RV32-NEXT:    lw a2, 8(a0)
-; RV32-NEXT:    lw a3, 4(a0)
 ; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    lw a3, 4(s0)
 ; RV32-NEXT:    slli a4, a1, 30
 ; RV32-NEXT:    srli s1, a2, 2
 ; RV32-NEXT:    or s1, s1, a4
@@ -464,8 +464,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    mv s0, a0
 ; RV32M-NEXT:    lbu a1, 12(a0)
 ; RV32M-NEXT:    lw a2, 8(a0)
-; RV32M-NEXT:    lw a3, 4(a0)
 ; RV32M-NEXT:    lw a0, 0(a0)
+; RV32M-NEXT:    lw a3, 4(s0)
 ; RV32M-NEXT:    slli a4, a1, 30
 ; RV32M-NEXT:    srli s1, a2, 2
 ; RV32M-NEXT:    or s1, s1, a4
@@ -614,8 +614,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    mv s0, a0
 ; RV32MV-NEXT:    lbu a1, 12(a0)
 ; RV32MV-NEXT:    lw a2, 8(a0)
-; RV32MV-NEXT:    lw a3, 4(a0)
 ; RV32MV-NEXT:    lw a0, 0(a0)
+; RV32MV-NEXT:    lw a3, 4(s0)
 ; RV32MV-NEXT:    slli a4, a1, 30
 ; RV32MV-NEXT:    srli s1, a2, 2
 ; RV32MV-NEXT:    or s1, s1, a4
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index ed10341580ef8d..2d84cbf3e41fd8 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -655,36 +655,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 4(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 12(a1)
-; RV32IM-NEXT:    lh a1, 0(a1)
+; RV32IM-NEXT:    lh a2, 12(a1)
+; RV32IM-NEXT:    lh a3, 0(a1)
+; RV32IM-NEXT:    lh a4, 4(a1)
+; RV32IM-NEXT:    lh a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    add a5, a5, a4
+; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    add a5, a5, a2
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    srli a5, a1, 26
-; RV32IM-NEXT:    add a5, a1, a5
-; RV32IM-NEXT:    andi a5, a5, -64
-; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    srli a5, a2, 27
-; RV32IM-NEXT:    add a5, a2, a5
-; RV32IM-NEXT:    andi a5, a5, -32
 ; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    srli a5, a3, 29
+; RV32IM-NEXT:    srli a5, a3, 26
 ; RV32IM-NEXT:    add a5, a3, a5
-; RV32IM-NEXT:    andi a5, a5, -8
+; RV32IM-NEXT:    andi a5, a5, -64
 ; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a2, 2(a0)
-; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    srli a5, a4, 27
+; RV32IM-NEXT:    add a5, a4, a5
+; RV32IM-NEXT:    andi a5, a5, -32
+; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    srli a5, a1, 29
+; RV32IM-NEXT:    add a5, a1, a5
+; RV32IM-NEXT:    andi a5, a5, -8
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 4(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_power_of_two:
@@ -731,9 +731,9 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lh a2, 24(a1)
 ; RV64IM-NEXT:    lui a3, %hi(.LCPI3_0)
 ; RV64IM-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
-; RV64IM-NEXT:    lh a4, 16(a1)
+; RV64IM-NEXT:    lh a4, 0(a1)
 ; RV64IM-NEXT:    lh a5, 8(a1)
-; RV64IM-NEXT:    lh a1, 0(a1)
+; RV64IM-NEXT:    lh a1, 16(a1)
 ; RV64IM-NEXT:    mulh a3, a2, a3
 ; RV64IM-NEXT:    add a3, a3, a2
 ; RV64IM-NEXT:    srli a6, a3, 63
@@ -742,21 +742,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    li a6, 95
 ; RV64IM-NEXT:    mul a3, a3, a6
 ; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    srli a3, a1, 58
-; RV64IM-NEXT:    add a3, a1, a3
+; RV64IM-NEXT:    srli a3, a4, 58
+; RV64IM-NEXT:    add a3, a4, a3
 ; RV64IM-NEXT:    andi a3, a3, -64
-; RV64IM-NEXT:    subw a1, a1, a3
+; RV64IM-NEXT:    subw a4, a4, a3
 ; RV64IM-NEXT:    srli a3, a5, 59
 ; RV64IM-NEXT:    add a3, a5, a3
 ; RV64IM-NEXT:    andi a3, a3, -32
 ; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    srli a3, a4, 61
-; RV64IM-NEXT:    add a3, a4, a3
+; RV64IM-NEXT:    srli a3, a1, 61
+; RV64IM-NEXT:    add a3, a1, a3
 ; RV64IM-NEXT:    andi a3, a3, -8
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 4(a0)
+; RV64IM-NEXT:    subw a1, a1, a3
+; RV64IM-NEXT:    sh a1, 4(a0)
 ; RV64IM-NEXT:    sh a5, 2(a0)
-; RV64IM-NEXT:    sh a1, 0(a0)
+; RV64IM-NEXT:    sh a4, 0(a0)
 ; RV64IM-NEXT:    sh a2, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
@@ -1085,50 +1085,50 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw s0, 8(a1)
-; RV32I-NEXT:    lw s1, 12(a1)
-; RV32I-NEXT:    lw s2, 16(a1)
-; RV32I-NEXT:    lw s3, 20(a1)
+; RV32I-NEXT:    lw s0, 0(a1)
+; RV32I-NEXT:    lw s1, 4(a1)
+; RV32I-NEXT:    lw s2, 8(a1)
+; RV32I-NEXT:    lw s3, 12(a1)
+; RV32I-NEXT:    lw a3, 16(a1)
+; RV32I-NEXT:    lw a4, 20(a1)
 ; RV32I-NEXT:    lw s4, 24(a1)
 ; RV32I-NEXT:    lw s5, 28(a1)
 ; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:    li a2, 1
+; RV32I-NEXT:    li a2, 23
 ; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
-; RV32I-NEXT:    li a2, 654
+; RV32I-NEXT:    lui a0, 1
+; RV32I-NEXT:    addi a2, a0, 1327
+; RV32I-NEXT:    mv a0, s4
+; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    li a3, 0
+; RV32I-NEXT:    call __moddi3@plt
+; RV32I-NEXT:    mv s4, a0
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    li a2, 23
+; RV32I-NEXT:    li a2, 654
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
-; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    mv s3, a1
-; RV32I-NEXT:    lui a0, 1
-; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
-; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    call __moddi3@plt
-; RV32I-NEXT:    sw a1, 28(s6)
-; RV32I-NEXT:    sw a0, 24(s6)
-; RV32I-NEXT:    sw s3, 20(s6)
-; RV32I-NEXT:    sw s2, 16(s6)
-; RV32I-NEXT:    sw s1, 12(s6)
-; RV32I-NEXT:    sw s0, 8(s6)
-; RV32I-NEXT:    sw s8, 4(s6)
-; RV32I-NEXT:    sw s7, 0(s6)
+; RV32I-NEXT:    sw s5, 28(s6)
+; RV32I-NEXT:    sw s4, 24(s6)
+; RV32I-NEXT:    sw s8, 20(s6)
+; RV32I-NEXT:    sw s7, 16(s6)
+; RV32I-NEXT:    sw a1, 12(s6)
+; RV32I-NEXT:    sw a0, 8(s6)
+; RV32I-NEXT:    sw s1, 4(s6)
+; RV32I-NEXT:    sw s0, 0(s6)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
@@ -1155,50 +1155,50 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 4(a1)
-; RV32IM-NEXT:    lw s0, 8(a1)
-; RV32IM-NEXT:    lw s1, 12(a1)
-; RV32IM-NEXT:    lw s2, 16(a1)
-; RV32IM-NEXT:    lw s3, 20(a1)
+; RV32IM-NEXT:    lw s0, 0(a1)
+; RV32IM-NEXT:    lw s1, 4(a1)
+; RV32IM-NEXT:    lw s2, 8(a1)
+; RV32IM-NEXT:    lw s3, 12(a1)
+; RV32IM-NEXT:    lw a3, 16(a1)
+; RV32IM-NEXT:    lw a4, 20(a1)
 ; RV32IM-NEXT:    lw s4, 24(a1)
 ; RV32IM-NEXT:    lw s5, 28(a1)
 ; RV32IM-NEXT:    mv s6, a0
-; RV32IM-NEXT:    li a2, 1
+; RV32IM-NEXT:    li a2, 23
 ; RV32IM-NEXT:    mv a0, a3
 ; RV32IM-NEXT:    mv a1, a4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
-; RV32IM-NEXT:    li a2, 654
+; RV32IM-NEXT:    lui a0, 1
+; RV32IM-NEXT:    addi a2, a0, 1327
+; RV32IM-NEXT:    mv a0, s4
+; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    li a3, 0
+; RV32IM-NEXT:    call __moddi3@plt
+; RV32IM-NEXT:    mv s4, a0
+; RV32IM-NEXT:    mv s5, a1
+; RV32IM-NEXT:    li a2, 1
 ; RV32IM-NEXT:    mv a0, s0
 ; RV32IM-NEXT:    mv a1, s1
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    mv s0, a0
 ; RV32IM-NEXT:    mv s1, a1
-; RV32IM-NEXT:    li a2, 23
+; RV32IM-NEXT:    li a2, 654
 ; RV32IM-NEXT:    mv a0, s2
 ; RV32IM-NEXT:    mv a1, s3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
-; RV32IM-NEXT:    mv s2, a0
-; RV32IM-NEXT:    mv s3, a1
-; RV32IM-NEXT:    lui a0, 1
-; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
-; RV32IM-NEXT:    li a3, 0
-; RV32IM-NEXT:    call __moddi3@plt
-; RV32IM-NEXT:    sw a1, 28(s6)
-; RV32IM-NEXT:    sw a0, 24(s6)
-; RV32IM-NEXT:    sw s3, 20(s6)
-; RV32IM-NEXT:    sw s2, 16(s6)
-; RV32IM-NEXT:    sw s1, 12(s6)
-; RV32IM-NEXT:    sw s0, 8(s6)
-; RV32IM-NEXT:    sw s8, 4(s6)
-; RV32IM-NEXT:    sw s7, 0(s6)
+; RV32IM-NEXT:    sw s5, 28(s6)
+; RV32IM-NEXT:    sw s4, 24(s6)
+; RV32IM-NEXT:    sw s8, 20(s6)
+; RV32IM-NEXT:    sw s7, 16(s6)
+; RV32IM-NEXT:    sw a1, 12(s6)
+; RV32IM-NEXT:    sw a0, 8(s6)
+; RV32IM-NEXT:    sw s1, 4(s6)
+; RV32IM-NEXT:    sw s0, 0(s6)
 ; RV32IM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 1a9126acac8da1..1ce388fc7ac4a5 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -10,47 +10,47 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a3, 0(a1)
+; RISCV32-NEXT:    lw a3, 0(a2)
+; RISCV32-NEXT:    lw a7, 4(a2)
+; RISCV32-NEXT:    lw a4, 8(a2)
+; RISCV32-NEXT:    lw a5, 12(a2)
+; RISCV32-NEXT:    lw a2, 0(a1)
 ; RISCV32-NEXT:    lw t2, 4(a1)
-; RISCV32-NEXT:    lw a4, 8(a1)
-; RISCV32-NEXT:    lw a5, 12(a1)
-; RISCV32-NEXT:    lw a1, 0(a2)
-; RISCV32-NEXT:    lw t0, 4(a2)
-; RISCV32-NEXT:    lw a6, 8(a2)
-; RISCV32-NEXT:    lw a7, 12(a2)
-; RISCV32-NEXT:    mulhu a2, a3, a1
-; RISCV32-NEXT:    mul t1, t2, a1
-; RISCV32-NEXT:    add a2, t1, a2
-; RISCV32-NEXT:    sltu t1, a2, t1
-; RISCV32-NEXT:    mulhu t3, t2, a1
+; RISCV32-NEXT:    lw a6, 8(a1)
+; RISCV32-NEXT:    lw a1, 12(a1)
+; RISCV32-NEXT:    mulhu t0, a2, a3
+; RISCV32-NEXT:    mul t1, t2, a3
+; RISCV32-NEXT:    add t0, t1, t0
+; RISCV32-NEXT:    sltu t1, t0, t1
+; RISCV32-NEXT:    mulhu t3, t2, a3
 ; RISCV32-NEXT:    add t4, t3, t1
-; RISCV32-NEXT:    mul t1, a3, t0
-; RISCV32-NEXT:    add a2, t1, a2
-; RISCV32-NEXT:    sltu t1, a2, t1
-; RISCV32-NEXT:    mulhu t3, a3, t0
+; RISCV32-NEXT:    mul t1, a2, a7
+; RISCV32-NEXT:    add t0, t1, t0
+; RISCV32-NEXT:    sltu t1, t0, t1
+; RISCV32-NEXT:    mulhu t3, a2, a7
 ; RISCV32-NEXT:    add t1, t3, t1
 ; RISCV32-NEXT:    add t5, t4, t1
-; RISCV32-NEXT:    mul t6, t2, t0
+; RISCV32-NEXT:    mul t6, t2, a7
 ; RISCV32-NEXT:    add s0, t6, t5
-; RISCV32-NEXT:    mul t1, a6, a3
-; RISCV32-NEXT:    mul s3, a4, a1
+; RISCV32-NEXT:    mul t1, a4, a2
+; RISCV32-NEXT:    mul s3, a6, a3
 ; RISCV32-NEXT:    add s4, s3, t1
 ; RISCV32-NEXT:    add t1, s0, s4
 ; RISCV32-NEXT:    sltu t3, t1, s0
 ; RISCV32-NEXT:    sltu s0, s0, t6
 ; RISCV32-NEXT:    sltu t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, t2, t0
+; RISCV32-NEXT:    mulhu t5, t2, a7
 ; RISCV32-NEXT:    add t4, t5, t4
 ; RISCV32-NEXT:    add s0, t4, s0
-; RISCV32-NEXT:    mul t4, t2, a6
-; RISCV32-NEXT:    mul t5, a7, a3
+; RISCV32-NEXT:    mul t4, t2, a4
+; RISCV32-NEXT:    mul t5, a5, a2
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu s1, a6, a3
+; RISCV32-NEXT:    mulhu s1, a4, a2
 ; RISCV32-NEXT:    add s2, s1, t4
-; RISCV32-NEXT:    mul t4, t0, a4
-; RISCV32-NEXT:    mul t5, a5, a1
+; RISCV32-NEXT:    mul t4, a7, a6
+; RISCV32-NEXT:    mul t5, a1, a3
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, a4, a1
+; RISCV32-NEXT:    mulhu t5, a6, a3
 ; RISCV32-NEXT:    add t6, t5, t4
 ; RISCV32-NEXT:    add t4, t6, s2
 ; RISCV32-NEXT:    sltu s3, s4, s3
@@ -63,41 +63,41 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:  .LBB0_2: # %start
 ; RISCV32-NEXT:    sltu s0, s2, s1
 ; RISCV32-NEXT:    snez s1, t2
-; RISCV32-NEXT:    snez s2, a7
+; RISCV32-NEXT:    snez s2, a5
 ; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    mulhu s2, a7, a3
+; RISCV32-NEXT:    mulhu s2, a5, a2
 ; RISCV32-NEXT:    snez s2, s2
 ; RISCV32-NEXT:    or s1, s1, s2
-; RISCV32-NEXT:    mulhu t2, t2, a6
+; RISCV32-NEXT:    mulhu t2, t2, a4
 ; RISCV32-NEXT:    snez t2, t2
 ; RISCV32-NEXT:    or t2, s1, t2
 ; RISCV32-NEXT:    or t2, t2, s0
 ; RISCV32-NEXT:    sltu t5, t6, t5
-; RISCV32-NEXT:    snez t6, t0
-; RISCV32-NEXT:    snez s0, a5
+; RISCV32-NEXT:    snez t6, a7
+; RISCV32-NEXT:    snez s0, a1
 ; RISCV32-NEXT:    and t6, s0, t6
-; RISCV32-NEXT:    mulhu s0, a5, a1
+; RISCV32-NEXT:    mulhu s0, a1, a3
 ; RISCV32-NEXT:    snez s0, s0
 ; RISCV32-NEXT:    or t6, t6, s0
-; RISCV32-NEXT:    mulhu t0, t0, a4
-; RISCV32-NEXT:    snez t0, t0
-; RISCV32-NEXT:    or t0, t6, t0
-; RISCV32-NEXT:    or t0, t0, t5
-; RISCV32-NEXT:    or a6, a6, a7
-; RISCV32-NEXT:    snez a6, a6
+; RISCV32-NEXT:    mulhu a7, a7, a6
+; RISCV32-NEXT:    snez a7, a7
+; RISCV32-NEXT:    or a7, t6, a7
+; RISCV32-NEXT:    or a7, a7, t5
 ; RISCV32-NEXT:    or a4, a4, a5
 ; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    and a4, a4, a6
-; RISCV32-NEXT:    or a4, a4, t0
-; RISCV32-NEXT:    or a5, t2, t3
-; RISCV32-NEXT:    or a4, a4, a5
-; RISCV32-NEXT:    mul a1, a3, a1
-; RISCV32-NEXT:    andi a4, a4, 1
-; RISCV32-NEXT:    sw a1, 0(a0)
-; RISCV32-NEXT:    sw a2, 4(a0)
+; RISCV32-NEXT:    or a1, a6, a1
+; RISCV32-NEXT:    snez a1, a1
+; RISCV32-NEXT:    and a1, a1, a4
+; RISCV32-NEXT:    or a1, a1, a7
+; RISCV32-NEXT:    or a4, t2, t3
+; RISCV32-NEXT:    or a1, a1, a4
+; RISCV32-NEXT:    mul a2, a2, a3
+; RISCV32-NEXT:    andi a1, a1, 1
+; RISCV32-NEXT:    sw a2, 0(a0)
+; RISCV32-NEXT:    sw t0, 4(a0)
 ; RISCV32-NEXT:    sw t1, 8(a0)
 ; RISCV32-NEXT:    sw t4, 12(a0)
-; RISCV32-NEXT:    sb a4, 16(a0)
+; RISCV32-NEXT:    sb a1, 16(a0)
 ; RISCV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 765a3a1ee801c1..228fb308ab44d4 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -556,23 +556,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 4(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 12(a1)
-; RV32IM-NEXT:    lhu a1, 0(a1)
+; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a3, 0(a1)
+; RV32IM-NEXT:    lhu a4, 4(a1)
+; RV32IM-NEXT:    lhu a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    mulhu a5, a2, a5
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    andi a1, a1, 63
-; RV32IM-NEXT:    andi a2, a2, 31
-; RV32IM-NEXT:    andi a3, a3, 7
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a2, 2(a0)
-; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    sub a2, a2, a5
+; RV32IM-NEXT:    andi a3, a3, 63
+; RV32IM-NEXT:    andi a4, a4, 31
+; RV32IM-NEXT:    andi a1, a1, 7
+; RV32IM-NEXT:    sh a1, 4(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_power_of_two:
@@ -611,19 +611,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lhu a2, 24(a1)
 ; RV64IM-NEXT:    lui a3, %hi(.LCPI3_0)
 ; RV64IM-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
-; RV64IM-NEXT:    lhu a4, 16(a1)
+; RV64IM-NEXT:    lhu a4, 0(a1)
 ; RV64IM-NEXT:    lhu a5, 8(a1)
-; RV64IM-NEXT:    lhu a1, 0(a1)
+; RV64IM-NEXT:    lhu a1, 16(a1)
 ; RV64IM-NEXT:    mulhu a3, a2, a3
 ; RV64IM-NEXT:    li a6, 95
 ; RV64IM-NEXT:    mul a3, a3, a6
 ; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    andi a1, a1, 63
+; RV64IM-NEXT:    andi a3, a4, 63
 ; RV64IM-NEXT:    andi a5, a5, 31
-; RV64IM-NEXT:    andi a4, a4, 7
-; RV64IM-NEXT:    sh a4, 4(a0)
+; RV64IM-NEXT:    andi a1, a1, 7
+; RV64IM-NEXT:    sh a1, 4(a0)
 ; RV64IM-NEXT:    sh a5, 2(a0)
-; RV64IM-NEXT:    sh a1, 0(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    sh a2, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
@@ -791,50 +791,50 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw s0, 8(a1)
-; RV32I-NEXT:    lw s1, 12(a1)
-; RV32I-NEXT:    lw s2, 16(a1)
-; RV32I-NEXT:    lw s3, 20(a1)
+; RV32I-NEXT:    lw s0, 0(a1)
+; RV32I-NEXT:    lw s1, 4(a1)
+; RV32I-NEXT:    lw s2, 8(a1)
+; RV32I-NEXT:    lw s3, 12(a1)
+; RV32I-NEXT:    lw a3, 16(a1)
+; RV32I-NEXT:    lw a4, 20(a1)
 ; RV32I-NEXT:    lw s4, 24(a1)
 ; RV32I-NEXT:    lw s5, 28(a1)
 ; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:    li a2, 1
+; RV32I-NEXT:    li a2, 23
 ; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
-; RV32I-NEXT:    li a2, 654
+; RV32I-NEXT:    lui a0, 1
+; RV32I-NEXT:    addi a2, a0, 1327
+; RV32I-NEXT:    mv a0, s4
+; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    li a3, 0
+; RV32I-NEXT:    call __umoddi3@plt
+; RV32I-NEXT:    mv s4, a0
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    li a2, 23
+; RV32I-NEXT:    li a2, 654
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
-; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    mv s3, a1
-; RV32I-NEXT:    lui a0, 1
-; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
-; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    call __umoddi3@plt
-; RV32I-NEXT:    sw a1, 28(s6)
-; RV32I-NEXT:    sw a0, 24(s6)
-; RV32I-NEXT:    sw s3, 20(s6)
-; RV32I-NEXT:    sw s2, 16(s6)
-; RV32I-NEXT:    sw s1, 12(s6)
-; RV32I-NEXT:    sw s0, 8(s6)
-; RV32I-NEXT:    sw s8, 4(s6)
-; RV32I-NEXT:    sw s7, 0(s6)
+; RV32I-NEXT:    sw s5, 28(s6)
+; RV32I-NEXT:    sw s4, 24(s6)
+; RV32I-NEXT:    sw s8, 20(s6)
+; RV32I-NEXT:    sw s7, 16(s6)
+; RV32I-NEXT:    sw a1, 12(s6)
+; RV32I-NEXT:    sw a0, 8(s6)
+; RV32I-NEXT:    sw s1, 4(s6)
+; RV32I-NEXT:    sw s0, 0(s6)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
@@ -861,50 +861,50 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 4(a1)
-; RV32IM-NEXT:    lw s0, 8(a1)
-; RV32IM-NEXT:    lw s1, 12(a1)
-; RV32IM-NEXT:    lw s2, 16(a1)
-; RV32IM-NEXT:    lw s3, 20(a1)
+; RV32IM-NEXT:    lw s0, 0(a1)
+; RV32IM-NEXT:    lw s1, 4(a1)
+; RV32IM-NEXT:    lw s2, 8(a1)
+; RV32IM-NEXT:    lw s3, 12(a1)
+; RV32IM-NEXT:    lw a3, 16(a1)
+; RV32IM-NEXT:    lw a4, 20(a1)
 ; RV32IM-NEXT:    lw s4, 24(a1)
 ; RV32IM-NEXT:    lw s5, 28(a1)
 ; RV32IM-NEXT:    mv s6, a0
-; RV32IM-NEXT:    li a2, 1
+; RV32IM-NEXT:    li a2, 23
 ; RV32IM-NEXT:    mv a0, a3
 ; RV32IM-NEXT:    mv a1, a4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
-; RV32IM-NEXT:    li a2, 654
+; RV32IM-NEXT:    lui a0, 1
+; RV32IM-NEXT:    addi a2, a0, 1327
+; RV32IM-NEXT:    mv a0, s4
+; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    li a3, 0
+; RV32IM-NEXT:    call __umoddi3@plt
+; RV32IM-NEXT:    mv s4, a0
+; RV32IM-NEXT:    mv s5, a1
+; RV32IM-NEXT:    li a2, 1
 ; RV32IM-NEXT:    mv a0, s0
 ; RV32IM-NEXT:    mv a1, s1
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    mv s0, a0
 ; RV32IM-NEXT:    mv s1, a1
-; RV32IM-NEXT:    li a2, 23
+; RV32IM-NEXT:    li a2, 654
 ; RV32IM-NEXT:    mv a0, s2
 ; RV32IM-NEXT:    mv a1, s3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
-; RV32IM-NEXT:    mv s2, a0
-; RV32IM-NEXT:    mv s3, a1
-; RV32IM-NEXT:    lui a0, 1
-; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
-; RV32IM-NEXT:    li a3, 0
-; RV32IM-NEXT:    call __umoddi3@plt
-; RV32IM-NEXT:    sw a1, 28(s6)
-; RV32IM-NEXT:    sw a0, 24(s6)
-; RV32IM-NEXT:    sw s3, 20(s6)
-; RV32IM-NEXT:    sw s2, 16(s6)
-; RV32IM-NEXT:    sw s1, 12(s6)
-; RV32IM-NEXT:    sw s0, 8(s6)
-; RV32IM-NEXT:    sw s8, 4(s6)
-; RV32IM-NEXT:    sw s7, 0(s6)
+; RV32IM-NEXT:    sw s5, 28(s6)
+; RV32IM-NEXT:    sw s4, 24(s6)
+; RV32IM-NEXT:    sw s8, 20(s6)
+; RV32IM-NEXT:    sw s7, 16(s6)
+; RV32IM-NEXT:    sw a1, 12(s6)
+; RV32IM-NEXT:    sw a0, 8(s6)
+; RV32IM-NEXT:    sw s1, 4(s6)
+; RV32IM-NEXT:    sw s0, 0(s6)
 ; RV32IM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 3db42f1ba1a012..381e7b75080eb2 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -711,13 +711,9 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a5, 0(a0)
 ; RV32I-NEXT:    lbu a6, 1(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu t0, 3(a0)
@@ -733,9 +729,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s3, 13(a0)
 ; RV32I-NEXT:    lbu s4, 14(a0)
 ; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    lbu a0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 39(sp)
 ; RV32I-NEXT:    sb zero, 38(sp)
@@ -768,7 +768,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t0, 11(sp)
 ; RV32I-NEXT:    sb a7, 10(sp)
 ; RV32I-NEXT:    sb a6, 9(sp)
-; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a5, 8(sp)
 ; RV32I-NEXT:    slli a1, a0, 25
 ; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 8
@@ -996,13 +996,9 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a5, 0(a0)
 ; RV32I-NEXT:    lbu a6, 1(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu t0, 3(a0)
@@ -1018,9 +1014,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s3, 13(a0)
 ; RV32I-NEXT:    lbu s4, 14(a0)
 ; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    lbu a0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 23(sp)
 ; RV32I-NEXT:    sb zero, 22(sp)
@@ -1053,7 +1053,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t0, 27(sp)
 ; RV32I-NEXT:    sb a7, 26(sp)
 ; RV32I-NEXT:    sb a6, 25(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    sb a5, 24(sp)
 ; RV32I-NEXT:    slli a1, a0, 25
 ; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 24
@@ -1281,72 +1281,73 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu a4, 1(a1)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu a0, 2(a1)
+; RV32I-NEXT:    sw s6, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 15(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 5(a0)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s3, 14(a0)
+; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    lbu s4, 1(a1)
+; RV32I-NEXT:    slli s5, a3, 24
+; RV32I-NEXT:    lbu s6, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    slli a4, s5, 24
-; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    or a0, s4, a0
+; RV32I-NEXT:    slli s6, s6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s6
 ; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    sb s5, 23(sp)
-; RV32I-NEXT:    sb s4, 22(sp)
-; RV32I-NEXT:    sb s3, 21(sp)
-; RV32I-NEXT:    sb s2, 20(sp)
-; RV32I-NEXT:    sb s1, 19(sp)
-; RV32I-NEXT:    sb s0, 18(sp)
-; RV32I-NEXT:    sb t6, 17(sp)
-; RV32I-NEXT:    sb t5, 16(sp)
-; RV32I-NEXT:    sb t4, 15(sp)
-; RV32I-NEXT:    sb t3, 14(sp)
-; RV32I-NEXT:    sb t2, 13(sp)
-; RV32I-NEXT:    sb t1, 12(sp)
-; RV32I-NEXT:    sb t0, 11(sp)
-; RV32I-NEXT:    sb a7, 10(sp)
-; RV32I-NEXT:    sb a6, 9(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 36(sp)
-; RV32I-NEXT:    sb a4, 32(sp)
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 39(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 38(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 37(sp)
-; RV32I-NEXT:    sb a1, 35(sp)
-; RV32I-NEXT:    sb a3, 34(sp)
-; RV32I-NEXT:    sb a4, 33(sp)
-; RV32I-NEXT:    sb a1, 31(sp)
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
+; RV32I-NEXT:    sb a3, 19(sp)
+; RV32I-NEXT:    sb s3, 18(sp)
+; RV32I-NEXT:    sb s2, 17(sp)
+; RV32I-NEXT:    sb s1, 16(sp)
+; RV32I-NEXT:    sb s0, 15(sp)
+; RV32I-NEXT:    sb t6, 14(sp)
+; RV32I-NEXT:    sb t5, 13(sp)
+; RV32I-NEXT:    sb t4, 12(sp)
+; RV32I-NEXT:    sb t3, 11(sp)
+; RV32I-NEXT:    sb t2, 10(sp)
+; RV32I-NEXT:    sb t1, 9(sp)
+; RV32I-NEXT:    sb t0, 8(sp)
+; RV32I-NEXT:    sb a7, 7(sp)
+; RV32I-NEXT:    sb a6, 6(sp)
+; RV32I-NEXT:    sb a5, 5(sp)
+; RV32I-NEXT:    sb a4, 4(sp)
+; RV32I-NEXT:    srai a1, s5, 31
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    sb a1, 24(sp)
+; RV32I-NEXT:    sb a1, 20(sp)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 35(sp)
+; RV32I-NEXT:    srli a4, a1, 16
+; RV32I-NEXT:    sb a4, 34(sp)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 33(sp)
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a1, 25(sp)
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a4, 22(sp)
+; RV32I-NEXT:    sb a1, 21(sp)
 ; RV32I-NEXT:    slli a1, a0, 25
 ; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    addi a3, sp, 8
+; RV32I-NEXT:    addi a3, sp, 4
 ; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    lbu a3, 0(a1)
 ; RV32I-NEXT:    lbu a4, 1(a1)
@@ -1438,6 +1439,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 36(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1795,20 +1797,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    lbu ra, 29(a0)
-; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu a5, 30(a0)
 ; RV32I-NEXT:    lbu a4, 31(a0)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a6, a1, a5
-; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a7, a1, a6
+; RV32I-NEXT:    sw a7, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 0(a0)
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 1(a0)
@@ -1821,7 +1823,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 5(a0)
 ; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
 ; RV32I-NEXT:    lbu t4, 9(a0)
@@ -1839,14 +1841,14 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s9, 21(a0)
 ; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
 ; RV32I-NEXT:    lbu a3, 26(a0)
 ; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
 ; RV32I-NEXT:    sb a4, 59(sp)
-; RV32I-NEXT:    sb t0, 58(sp)
-; RV32I-NEXT:    sb ra, 57(sp)
+; RV32I-NEXT:    sb a5, 58(sp)
+; RV32I-NEXT:    sb t1, 57(sp)
 ; RV32I-NEXT:    sb a0, 56(sp)
 ; RV32I-NEXT:    sb a1, 55(sp)
 ; RV32I-NEXT:    sb a3, 54(sp)
@@ -1882,8 +1884,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 62(sp)
 ; RV32I-NEXT:    sb zero, 61(sp)
 ; RV32I-NEXT:    sb zero, 60(sp)
-; RV32I-NEXT:    sb a5, 53(sp)
-; RV32I-NEXT:    sb a7, 52(sp)
+; RV32I-NEXT:    sb a6, 53(sp)
+; RV32I-NEXT:    sb ra, 52(sp)
 ; RV32I-NEXT:    sb s11, 51(sp)
 ; RV32I-NEXT:    sb s10, 50(sp)
 ; RV32I-NEXT:    sb s9, 49(sp)
@@ -1901,7 +1903,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t4, 37(sp)
 ; RV32I-NEXT:    sb t3, 36(sp)
 ; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
+; RV32I-NEXT:    sb t0, 34(sp)
 ; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 33(sp)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
@@ -1914,7 +1916,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 29(sp)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    slli a0, a6, 24
+; RV32I-NEXT:    slli a0, a7, 24
 ; RV32I-NEXT:    srli a0, a0, 27
 ; RV32I-NEXT:    addi a1, sp, 28
 ; RV32I-NEXT:    add a0, a1, a0
@@ -2459,20 +2461,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    lbu ra, 29(a0)
-; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu a5, 30(a0)
 ; RV32I-NEXT:    lbu a4, 31(a0)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a6, a1, a5
-; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a7, a1, a6
+; RV32I-NEXT:    sw a7, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 0(a0)
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 1(a0)
@@ -2485,7 +2487,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 5(a0)
 ; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
 ; RV32I-NEXT:    lbu t4, 9(a0)
@@ -2503,14 +2505,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s9, 21(a0)
 ; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
 ; RV32I-NEXT:    lbu a3, 26(a0)
 ; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
 ; RV32I-NEXT:    sb a4, 91(sp)
-; RV32I-NEXT:    sb t0, 90(sp)
-; RV32I-NEXT:    sb ra, 89(sp)
+; RV32I-NEXT:    sb a5, 90(sp)
+; RV32I-NEXT:    sb t1, 89(sp)
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a1, 87(sp)
 ; RV32I-NEXT:    sb a3, 86(sp)
@@ -2546,8 +2548,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
 ; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb a5, 85(sp)
-; RV32I-NEXT:    sb a7, 84(sp)
+; RV32I-NEXT:    sb a6, 85(sp)
+; RV32I-NEXT:    sb ra, 84(sp)
 ; RV32I-NEXT:    sb s11, 83(sp)
 ; RV32I-NEXT:    sb s10, 82(sp)
 ; RV32I-NEXT:    sb s9, 81(sp)
@@ -2565,7 +2567,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t4, 69(sp)
 ; RV32I-NEXT:    sb t3, 68(sp)
 ; RV32I-NEXT:    sb t2, 67(sp)
-; RV32I-NEXT:    sb t1, 66(sp)
+; RV32I-NEXT:    sb t0, 66(sp)
 ; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 65(sp)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
@@ -2578,7 +2580,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 61(sp)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    slli a0, a6, 24
+; RV32I-NEXT:    slli a0, a7, 24
 ; RV32I-NEXT:    srli a0, a0, 27
 ; RV32I-NEXT:    addi a1, sp, 60
 ; RV32I-NEXT:    sub a0, a1, a0
@@ -2597,9 +2599,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t4, 14(a0)
 ; RV32I-NEXT:    lbu t5, 15(a0)
 ; RV32I-NEXT:    lbu s0, 16(a0)
-; RV32I-NEXT:    lbu t0, 17(a0)
-; RV32I-NEXT:    lbu s1, 18(a0)
-; RV32I-NEXT:    lbu s2, 19(a0)
+; RV32I-NEXT:    lbu s1, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu t0, 19(a0)
 ; RV32I-NEXT:    lbu s5, 20(a0)
 ; RV32I-NEXT:    lbu s10, 21(a0)
 ; RV32I-NEXT:    lbu s8, 22(a0)
@@ -2648,76 +2650,76 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli s9, s9, 24
 ; RV32I-NEXT:    or a1, s9, s8
 ; RV32I-NEXT:    or t2, a1, a0
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a0, t0, s0
-; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    andi t0, a1, 7
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or a1, s2, s1
-; RV32I-NEXT:    srli a3, a4, 1
-; RV32I-NEXT:    or t4, a1, a0
-; RV32I-NEXT:    xori t5, t0, 31
-; RV32I-NEXT:    srl a0, a3, t5
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t4, a0, 7
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a0, t0, s2
+; RV32I-NEXT:    srli a1, a4, 1
+; RV32I-NEXT:    or s0, a0, s0
+; RV32I-NEXT:    xori a3, t4, 31
+; RV32I-NEXT:    srl a0, a1, a3
 ; RV32I-NEXT:    slli ra, ra, 8
-; RV32I-NEXT:    or a3, ra, s11
+; RV32I-NEXT:    or t0, ra, s11
 ; RV32I-NEXT:    srli a1, t1, 1
-; RV32I-NEXT:    srl a1, a1, t5
+; RV32I-NEXT:    srl a1, a1, a3
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a6, a7, a6
 ; RV32I-NEXT:    srli a7, a5, 1
-; RV32I-NEXT:    or t6, a6, a3
-; RV32I-NEXT:    not a3, t0
-; RV32I-NEXT:    srl a6, a7, a3
+; RV32I-NEXT:    or t0, a6, t0
+; RV32I-NEXT:    not t5, t4
+; RV32I-NEXT:    srl a6, a7, t5
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    or a7, s6, s3
-; RV32I-NEXT:    srli s0, t4, 1
-; RV32I-NEXT:    srl s0, s0, t5
+; RV32I-NEXT:    srli t6, s0, 1
+; RV32I-NEXT:    srl t6, t6, a3
 ; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s7, s7, 24
 ; RV32I-NEXT:    or s1, s7, s4
 ; RV32I-NEXT:    srli s2, t3, 1
-; RV32I-NEXT:    srl s2, s2, a3
+; RV32I-NEXT:    srl s2, s2, t5
 ; RV32I-NEXT:    or a7, s1, a7
 ; RV32I-NEXT:    srli s1, a7, 1
-; RV32I-NEXT:    srl t5, s1, t5
-; RV32I-NEXT:    srli s1, t2, 1
 ; RV32I-NEXT:    srl s1, s1, a3
-; RV32I-NEXT:    sll a3, a5, t0
-; RV32I-NEXT:    sll a5, t3, t0
-; RV32I-NEXT:    sll t1, t1, t0
-; RV32I-NEXT:    sll t2, t2, t0
-; RV32I-NEXT:    sll t3, t4, t0
-; RV32I-NEXT:    sll t4, t6, t0
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    sll a4, a4, t0
-; RV32I-NEXT:    srli t0, a7, 24
-; RV32I-NEXT:    sb t0, 27(a2)
-; RV32I-NEXT:    srli t0, a7, 16
-; RV32I-NEXT:    sb t0, 26(a2)
-; RV32I-NEXT:    or t0, a7, s1
+; RV32I-NEXT:    srli a3, t2, 1
+; RV32I-NEXT:    srl t5, a3, t5
+; RV32I-NEXT:    sll a3, a5, t4
+; RV32I-NEXT:    sll a5, t3, t4
+; RV32I-NEXT:    sll t1, t1, t4
+; RV32I-NEXT:    sll t2, t2, t4
+; RV32I-NEXT:    sll t3, s0, t4
+; RV32I-NEXT:    sll t0, t0, t4
+; RV32I-NEXT:    sll a7, a7, t4
+; RV32I-NEXT:    sll a4, a4, t4
+; RV32I-NEXT:    srli t4, a7, 24
+; RV32I-NEXT:    sb t4, 27(a2)
+; RV32I-NEXT:    srli t4, a7, 16
+; RV32I-NEXT:    sb t4, 26(a2)
+; RV32I-NEXT:    or t4, a7, t5
 ; RV32I-NEXT:    srli a7, a7, 8
 ; RV32I-NEXT:    sb a7, 25(a2)
-; RV32I-NEXT:    srli a7, t4, 24
+; RV32I-NEXT:    srli a7, t0, 24
 ; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a7, t4, 16
+; RV32I-NEXT:    srli a7, t0, 16
 ; RV32I-NEXT:    sb a7, 30(a2)
-; RV32I-NEXT:    or a7, t4, t5
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 29(a2)
-; RV32I-NEXT:    srli t4, t3, 24
-; RV32I-NEXT:    sb t4, 19(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 18(a2)
-; RV32I-NEXT:    or t4, t3, s2
+; RV32I-NEXT:    or a7, t0, s1
+; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    sb t0, 29(a2)
+; RV32I-NEXT:    srli t0, t3, 24
+; RV32I-NEXT:    sb t0, 19(a2)
+; RV32I-NEXT:    srli t0, t3, 16
+; RV32I-NEXT:    sb t0, 18(a2)
+; RV32I-NEXT:    or t0, t3, s2
 ; RV32I-NEXT:    srli t3, t3, 8
 ; RV32I-NEXT:    sb t3, 17(a2)
 ; RV32I-NEXT:    srli t3, t2, 24
 ; RV32I-NEXT:    sb t3, 23(a2)
 ; RV32I-NEXT:    srli t3, t2, 16
 ; RV32I-NEXT:    sb t3, 22(a2)
-; RV32I-NEXT:    or t3, t2, s0
+; RV32I-NEXT:    or t3, t2, t6
 ; RV32I-NEXT:    srli t2, t2, 8
 ; RV32I-NEXT:    sb t2, 21(a2)
 ; RV32I-NEXT:    srli t2, t1, 24
@@ -2748,9 +2750,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb t4, 24(a2)
 ; RV32I-NEXT:    sb a7, 28(a2)
-; RV32I-NEXT:    sb t4, 16(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
 ; RV32I-NEXT:    sb t3, 20(a2)
 ; RV32I-NEXT:    sb a6, 8(a2)
 ; RV32I-NEXT:    sb a1, 12(a2)
diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
index 95cb61ef050519..b51903765d114e 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
@@ -57,14 +57,14 @@ define i64 @lwud(i32* %a) {
 define i64 @ldd(i64* %a) {
 ; RV32XTHEADMEMPAIR-LABEL: ldd:
 ; RV32XTHEADMEMPAIR:       # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT:    lw a1, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a1, 40(a0)
 ; RV32XTHEADMEMPAIR-NEXT:    lw a2, 32(a0)
 ; RV32XTHEADMEMPAIR-NEXT:    lw a3, 36(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a0, 40(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a4, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    add a0, a2, a1
+; RV32XTHEADMEMPAIR-NEXT:    sltu a1, a0, a2
+; RV32XTHEADMEMPAIR-NEXT:    add a3, a3, a4
 ; RV32XTHEADMEMPAIR-NEXT:    add a1, a3, a1
-; RV32XTHEADMEMPAIR-NEXT:    add a0, a2, a0
-; RV32XTHEADMEMPAIR-NEXT:    sltu a2, a0, a2
-; RV32XTHEADMEMPAIR-NEXT:    add a1, a1, a2
 ; RV32XTHEADMEMPAIR-NEXT:    ret
 ;
 ; RV64XTHEADMEMPAIR-LABEL: ldd:


